From d4261e5650004d6d51137553ea5433d5828562dc Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 19 Aug 2014 16:02:12 +0200 Subject: bonding: create netlink event when bonding option is changed Userspace needs to be notified whenever a bonding option is changed. Signed-off-by: Jiri Pirko Acked-by: Veaceslav Falico Acked-by: Andy Gospodarek Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 38377392d082..7e2b0b8b5cd7 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1982,6 +1982,7 @@ struct pcpu_sw_netstats { #define NETDEV_CHANGEUPPER 0x0015 #define NETDEV_RESEND_IGMP 0x0016 #define NETDEV_PRECHANGEMTU 0x0017 /* notify before mtu change happened */ +#define NETDEV_CHANGEINFODATA 0x0018 int register_netdevice_notifier(struct notifier_block *nb); int unregister_netdevice_notifier(struct notifier_block *nb); -- cgit v1.2.3-59-g8ed1b From 989e04c5bc3ff77d65e1f0d87bf7904dfa30d41c Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Fri, 22 Aug 2014 14:15:22 -0700 Subject: tcp: improve undo on timeout Upon timeout, undo (via both timestamps/Eifel and DSACKs) was disabled if any retransmits were still in flight. The concern was perhaps that spurious retransmission sent in a previous recovery episode may trigger DSACKs to falsely undo the current recovery. However, this inadvertently misses undo opportunities (using either TCP timestamps or DSACKs) when timeout occurs during a loss episode, i.e. recurring timeouts or timeout during fast recovery. In these cases some retransmissions will be in flight but we should allow undo. Furthermore, we should only reset undo_marker and undo_retrans upon timeout if we are starting a new recovery episode. Finally, when we do reset our undo state, we now do so in a manner similar to tcp_enter_recovery(), so that we require a DSACK for each of the outstanding retransmissions. This will achieve the original goal by requiring that we receive the same number of DSACKs as retransmissions. This patch increases the undo events by 50% on Google servers. Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: David S. Miller --- include/linux/tcp.h | 2 +- net/ipv4/tcp_input.c | 26 +++++++++++--------------- 2 files changed, 12 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index fa5258f322e7..e567f0dbf282 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -276,7 +276,7 @@ struct tcp_sock { u32 retrans_stamp; /* Timestamp of the last retransmit, * also used in SYN-SENT to remember stamp of * the first SYN. */ - u32 undo_marker; /* tracking retrans started here. */ + u32 undo_marker; /* snd_una upon a new recovery episode. */ int undo_retrans; /* number of undoable retransmissions.
*/ u32 total_retrans; /* Total retransmits for entire connection */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index a906e0200ff2..aba4926ca095 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1888,21 +1888,21 @@ static inline void tcp_reset_reno_sack(struct tcp_sock *tp) tp->sacked_out = 0; } -static void tcp_clear_retrans_partial(struct tcp_sock *tp) +void tcp_clear_retrans(struct tcp_sock *tp) { tp->retrans_out = 0; tp->lost_out = 0; - tp->undo_marker = 0; tp->undo_retrans = -1; + tp->fackets_out = 0; + tp->sacked_out = 0; } -void tcp_clear_retrans(struct tcp_sock *tp) +static inline void tcp_init_undo(struct tcp_sock *tp) { - tcp_clear_retrans_partial(tp); - - tp->fackets_out = 0; - tp->sacked_out = 0; + tp->undo_marker = tp->snd_una; + /* Retransmission still in flight may cause DSACKs later. */ + tp->undo_retrans = tp->retrans_out ? : -1; } /* Enter Loss state. If we detect SACK reneging, forget all SACK information @@ -1925,18 +1925,18 @@ void tcp_enter_loss(struct sock *sk) tp->prior_ssthresh = tcp_current_ssthresh(sk); tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); tcp_ca_event(sk, CA_EVENT_LOSS); + tcp_init_undo(tp); } tp->snd_cwnd = 1; tp->snd_cwnd_cnt = 0; tp->snd_cwnd_stamp = tcp_time_stamp; - tcp_clear_retrans_partial(tp); + tp->retrans_out = 0; + tp->lost_out = 0; if (tcp_is_reno(tp)) tcp_reset_reno_sack(tp); - tp->undo_marker = tp->snd_una; - skb = tcp_write_queue_head(sk); is_reneg = skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED); if (is_reneg) { @@ -1950,9 +1950,6 @@ void tcp_enter_loss(struct sock *sk) if (skb == tcp_send_head(sk)) break; - if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) - tp->undo_marker = 0; - TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED; if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || is_reneg) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED; @@ -2671,8 +2668,7 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack) NET_INC_STATS_BH(sock_net(sk), mib_idx); tp->prior_ssthresh = 0; - tp->undo_marker = tp->snd_una; - tp->undo_retrans = tp->retrans_out ? : -1; + tcp_init_undo(tp); if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { if (!ece_ack) -- cgit v1.2.3-59-g8ed1b From 3af20efc0f83cdc65ce56ec108c0e81f602364df Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 22 Aug 2014 18:55:39 -0700 Subject: net: phy: broadcom: extract all registers to brcmphy.h Commit 439d39a9ac8fbbba9c04581361188f33f21ced50 ("net: phy: broadcom: extract register definitions") added a bunch of registers to brcmphy.h but left some to broadcom.c, move all of them to the header file since the BCM54xx and BCM7xxx PHY drivers do share all of these registers. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/broadcom.c | 104 --------------------------------------------- include/linux/brcmphy.h | 103 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 103 insertions(+), 104 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/phy/broadcom.c b/drivers/net/phy/broadcom.c index 34088d60da74..6001d76d8676 100644 --- a/drivers/net/phy/broadcom.c +++ b/drivers/net/phy/broadcom.c @@ -25,110 +25,6 @@ #define BRCM_PHY_REV(phydev) \ ((phydev)->drv->phy_id & ~((phydev)->drv->phy_id_mask)) -/* - * Broadcom LED source encodings. These are used in BCM5461, BCM5481, - * BCM5482, and possibly some others. 
- */ -#define BCM_LED_SRC_LINKSPD1 0x0 -#define BCM_LED_SRC_LINKSPD2 0x1 -#define BCM_LED_SRC_XMITLED 0x2 -#define BCM_LED_SRC_ACTIVITYLED 0x3 -#define BCM_LED_SRC_FDXLED 0x4 -#define BCM_LED_SRC_SLAVE 0x5 -#define BCM_LED_SRC_INTR 0x6 -#define BCM_LED_SRC_QUALITY 0x7 -#define BCM_LED_SRC_RCVLED 0x8 -#define BCM_LED_SRC_MULTICOLOR1 0xa -#define BCM_LED_SRC_OPENSHORT 0xb -#define BCM_LED_SRC_OFF 0xe /* Tied high */ -#define BCM_LED_SRC_ON 0xf /* Tied low */ - - -/* - * BCM5482: Shadow registers - * Shadow values go into bits [14:10] of register 0x1c to select a shadow - * register to access. - */ -/* 00101: Spare Control Register 3 */ -#define BCM54XX_SHD_SCR3 0x05 -#define BCM54XX_SHD_SCR3_DEF_CLK125 0x0001 -#define BCM54XX_SHD_SCR3_DLLAPD_DIS 0x0002 -#define BCM54XX_SHD_SCR3_TRDDAPD 0x0004 - -/* 01010: Auto Power-Down */ -#define BCM54XX_SHD_APD 0x0a -#define BCM54XX_SHD_APD_EN 0x0020 - -#define BCM5482_SHD_LEDS1 0x0d /* 01101: LED Selector 1 */ - /* LED3 / ~LINKSPD[2] selector */ -#define BCM5482_SHD_LEDS1_LED3(src) ((src & 0xf) << 4) - /* LED1 / ~LINKSPD[1] selector */ -#define BCM5482_SHD_LEDS1_LED1(src) ((src & 0xf) << 0) -#define BCM54XX_SHD_RGMII_MODE 0x0b /* 01011: RGMII Mode Selector */ -#define BCM5482_SHD_SSD 0x14 /* 10100: Secondary SerDes control */ -#define BCM5482_SHD_SSD_LEDM 0x0008 /* SSD LED Mode enable */ -#define BCM5482_SHD_SSD_EN 0x0001 /* SSD enable */ -#define BCM5482_SHD_MODE 0x1f /* 11111: Mode Control Register */ -#define BCM5482_SHD_MODE_1000BX 0x0001 /* Enable 1000BASE-X registers */ - - -/* - * EXPANSION SHADOW ACCESS REGISTERS. (PHY REG 0x15, 0x16, and 0x17) - */ -#define MII_BCM54XX_EXP_AADJ1CH0 0x001f -#define MII_BCM54XX_EXP_AADJ1CH0_SWP_ABCD_OEN 0x0200 -#define MII_BCM54XX_EXP_AADJ1CH0_SWSEL_THPF 0x0100 -#define MII_BCM54XX_EXP_AADJ1CH3 0x601f -#define MII_BCM54XX_EXP_AADJ1CH3_ADCCKADJ 0x0002 -#define MII_BCM54XX_EXP_EXP08 0x0F08 -#define MII_BCM54XX_EXP_EXP08_RJCT_2MHZ 0x0001 -#define MII_BCM54XX_EXP_EXP08_EARLY_DAC_WAKE 0x0200 -#define MII_BCM54XX_EXP_EXP75 0x0f75 -#define MII_BCM54XX_EXP_EXP75_VDACCTRL 0x003c -#define MII_BCM54XX_EXP_EXP75_CM_OSC 0x0001 -#define MII_BCM54XX_EXP_EXP96 0x0f96 -#define MII_BCM54XX_EXP_EXP96_MYST 0x0010 -#define MII_BCM54XX_EXP_EXP97 0x0f97 -#define MII_BCM54XX_EXP_EXP97_MYST 0x0c0c - -/* - * BCM5482: Secondary SerDes registers - */ -#define BCM5482_SSD_1000BX_CTL 0x00 /* 1000BASE-X Control */ -#define BCM5482_SSD_1000BX_CTL_PWRDOWN 0x0800 /* Power-down SSD */ -#define BCM5482_SSD_SGMII_SLAVE 0x15 /* SGMII Slave Register */ -#define BCM5482_SSD_SGMII_SLAVE_EN 0x0002 /* Slave mode enable */ -#define BCM5482_SSD_SGMII_SLAVE_AD 0x0001 /* Slave auto-detection */ - - -/*****************************************************************************/ -/* Fast Ethernet Transceiver definitions. 
*/ -/*****************************************************************************/ - -#define MII_BRCM_FET_INTREG 0x1a /* Interrupt register */ -#define MII_BRCM_FET_IR_MASK 0x0100 /* Mask all interrupts */ -#define MII_BRCM_FET_IR_LINK_EN 0x0200 /* Link status change enable */ -#define MII_BRCM_FET_IR_SPEED_EN 0x0400 /* Link speed change enable */ -#define MII_BRCM_FET_IR_DUPLEX_EN 0x0800 /* Duplex mode change enable */ -#define MII_BRCM_FET_IR_ENABLE 0x4000 /* Interrupt enable */ - -#define MII_BRCM_FET_BRCMTEST 0x1f /* Brcm test register */ -#define MII_BRCM_FET_BT_SRE 0x0080 /* Shadow register enable */ - - -/*** Shadow register definitions ***/ - -#define MII_BRCM_FET_SHDW_MISCCTRL 0x10 /* Shadow misc ctrl */ -#define MII_BRCM_FET_SHDW_MC_FAME 0x4000 /* Force Auto MDIX enable */ - -#define MII_BRCM_FET_SHDW_AUXMODE4 0x1a /* Auxiliary mode 4 */ -#define MII_BRCM_FET_SHDW_AM4_LED_MASK 0x0003 -#define MII_BRCM_FET_SHDW_AM4_LED_MODE1 0x0001 - -#define MII_BRCM_FET_SHDW_AUXSTAT2 0x1b /* Auxiliary status 2 */ -#define MII_BRCM_FET_SHDW_AS2_APDE 0x0020 /* Auto power down enable */ - - MODULE_DESCRIPTION("Broadcom PHY driver"); MODULE_AUTHOR("Maciej W. Rozycki"); MODULE_LICENSE("GPL"); diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h index 61219b9b3445..be31bf9f60c2 100644 --- a/include/linux/brcmphy.h +++ b/include/linux/brcmphy.h @@ -92,4 +92,107 @@ #define MII_BCM54XX_AUXCTL_SHDWSEL_AUXCTL 0x0000 +/* + * Broadcom LED source encodings. These are used in BCM5461, BCM5481, + * BCM5482, and possibly some others. + */ +#define BCM_LED_SRC_LINKSPD1 0x0 +#define BCM_LED_SRC_LINKSPD2 0x1 +#define BCM_LED_SRC_XMITLED 0x2 +#define BCM_LED_SRC_ACTIVITYLED 0x3 +#define BCM_LED_SRC_FDXLED 0x4 +#define BCM_LED_SRC_SLAVE 0x5 +#define BCM_LED_SRC_INTR 0x6 +#define BCM_LED_SRC_QUALITY 0x7 +#define BCM_LED_SRC_RCVLED 0x8 +#define BCM_LED_SRC_MULTICOLOR1 0xa +#define BCM_LED_SRC_OPENSHORT 0xb +#define BCM_LED_SRC_OFF 0xe /* Tied high */ +#define BCM_LED_SRC_ON 0xf /* Tied low */ + + +/* + * BCM5482: Shadow registers + * Shadow values go into bits [14:10] of register 0x1c to select a shadow + * register to access. + */ +/* 00101: Spare Control Register 3 */ +#define BCM54XX_SHD_SCR3 0x05 +#define BCM54XX_SHD_SCR3_DEF_CLK125 0x0001 +#define BCM54XX_SHD_SCR3_DLLAPD_DIS 0x0002 +#define BCM54XX_SHD_SCR3_TRDDAPD 0x0004 + +/* 01010: Auto Power-Down */ +#define BCM54XX_SHD_APD 0x0a +#define BCM54XX_SHD_APD_EN 0x0020 + +#define BCM5482_SHD_LEDS1 0x0d /* 01101: LED Selector 1 */ + /* LED3 / ~LINKSPD[2] selector */ +#define BCM5482_SHD_LEDS1_LED3(src) ((src & 0xf) << 4) + /* LED1 / ~LINKSPD[1] selector */ +#define BCM5482_SHD_LEDS1_LED1(src) ((src & 0xf) << 0) +#define BCM54XX_SHD_RGMII_MODE 0x0b /* 01011: RGMII Mode Selector */ +#define BCM5482_SHD_SSD 0x14 /* 10100: Secondary SerDes control */ +#define BCM5482_SHD_SSD_LEDM 0x0008 /* SSD LED Mode enable */ +#define BCM5482_SHD_SSD_EN 0x0001 /* SSD enable */ +#define BCM5482_SHD_MODE 0x1f /* 11111: Mode Control Register */ +#define BCM5482_SHD_MODE_1000BX 0x0001 /* Enable 1000BASE-X registers */ + + +/* + * EXPANSION SHADOW ACCESS REGISTERS. 
(PHY REG 0x15, 0x16, and 0x17) + */ +#define MII_BCM54XX_EXP_AADJ1CH0 0x001f +#define MII_BCM54XX_EXP_AADJ1CH0_SWP_ABCD_OEN 0x0200 +#define MII_BCM54XX_EXP_AADJ1CH0_SWSEL_THPF 0x0100 +#define MII_BCM54XX_EXP_AADJ1CH3 0x601f +#define MII_BCM54XX_EXP_AADJ1CH3_ADCCKADJ 0x0002 +#define MII_BCM54XX_EXP_EXP08 0x0F08 +#define MII_BCM54XX_EXP_EXP08_RJCT_2MHZ 0x0001 +#define MII_BCM54XX_EXP_EXP08_EARLY_DAC_WAKE 0x0200 +#define MII_BCM54XX_EXP_EXP75 0x0f75 +#define MII_BCM54XX_EXP_EXP75_VDACCTRL 0x003c +#define MII_BCM54XX_EXP_EXP75_CM_OSC 0x0001 +#define MII_BCM54XX_EXP_EXP96 0x0f96 +#define MII_BCM54XX_EXP_EXP96_MYST 0x0010 +#define MII_BCM54XX_EXP_EXP97 0x0f97 +#define MII_BCM54XX_EXP_EXP97_MYST 0x0c0c + +/* + * BCM5482: Secondary SerDes registers + */ +#define BCM5482_SSD_1000BX_CTL 0x00 /* 1000BASE-X Control */ +#define BCM5482_SSD_1000BX_CTL_PWRDOWN 0x0800 /* Power-down SSD */ +#define BCM5482_SSD_SGMII_SLAVE 0x15 /* SGMII Slave Register */ +#define BCM5482_SSD_SGMII_SLAVE_EN 0x0002 /* Slave mode enable */ +#define BCM5482_SSD_SGMII_SLAVE_AD 0x0001 /* Slave auto-detection */ + + +/*****************************************************************************/ +/* Fast Ethernet Transceiver definitions. */ +/*****************************************************************************/ + +#define MII_BRCM_FET_INTREG 0x1a /* Interrupt register */ +#define MII_BRCM_FET_IR_MASK 0x0100 /* Mask all interrupts */ +#define MII_BRCM_FET_IR_LINK_EN 0x0200 /* Link status change enable */ +#define MII_BRCM_FET_IR_SPEED_EN 0x0400 /* Link speed change enable */ +#define MII_BRCM_FET_IR_DUPLEX_EN 0x0800 /* Duplex mode change enable */ +#define MII_BRCM_FET_IR_ENABLE 0x4000 /* Interrupt enable */ + +#define MII_BRCM_FET_BRCMTEST 0x1f /* Brcm test register */ +#define MII_BRCM_FET_BT_SRE 0x0080 /* Shadow register enable */ + + +/*** Shadow register definitions ***/ + +#define MII_BRCM_FET_SHDW_MISCCTRL 0x10 /* Shadow misc ctrl */ +#define MII_BRCM_FET_SHDW_MC_FAME 0x4000 /* Force Auto MDIX enable */ + +#define MII_BRCM_FET_SHDW_AUXMODE4 0x1a /* Auxiliary mode 4 */ +#define MII_BRCM_FET_SHDW_AM4_LED_MASK 0x0003 +#define MII_BRCM_FET_SHDW_AM4_LED_MODE1 0x0001 + +#define MII_BRCM_FET_SHDW_AUXSTAT2 0x1b /* Auxiliary status 2 */ +#define MII_BRCM_FET_SHDW_AS2_APDE 0x0020 /* Auto power down enable */ + #endif /* _LINUX_BRCMPHY_H */ -- cgit v1.2.3-59-g8ed1b From 705314797b8b997554b7e9d0ea7b65a497356e53 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 22 Aug 2014 18:55:40 -0700 Subject: net: phy: broadcom: move shadow 0x1C register accessors to brcmphy.h The shadow register 0x1C is used both by the BCM54xxx PHYs and the BCM7xxx internal PHYs, move the accessors to a common location so both drivers can use them. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/broadcom.c | 18 ------------------ include/linux/brcmphy.h | 20 ++++++++++++++++++++ 2 files changed, 20 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/phy/broadcom.c b/drivers/net/phy/broadcom.c index 6001d76d8676..854f2c9a7b2b 100644 --- a/drivers/net/phy/broadcom.c +++ b/drivers/net/phy/broadcom.c @@ -29,24 +29,6 @@ MODULE_DESCRIPTION("Broadcom PHY driver"); MODULE_AUTHOR("Maciej W. Rozycki"); MODULE_LICENSE("GPL"); -/* - * Indirect register access functions for the 1000BASE-T/100BASE-TX/10BASE-T - * 0x1c shadow registers. 
- */ -static int bcm54xx_shadow_read(struct phy_device *phydev, u16 shadow) -{ - phy_write(phydev, MII_BCM54XX_SHD, MII_BCM54XX_SHD_VAL(shadow)); - return MII_BCM54XX_SHD_DATA(phy_read(phydev, MII_BCM54XX_SHD)); -} - -static int bcm54xx_shadow_write(struct phy_device *phydev, u16 shadow, u16 val) -{ - return phy_write(phydev, MII_BCM54XX_SHD, - MII_BCM54XX_SHD_WRITE | - MII_BCM54XX_SHD_VAL(shadow) | - MII_BCM54XX_SHD_DATA(val)); -} - /* Indirect register access functions for the Expansion Registers */ static int bcm54xx_exp_read(struct phy_device *phydev, u16 regnum) { diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h index be31bf9f60c2..722cf26567fa 100644 --- a/include/linux/brcmphy.h +++ b/include/linux/brcmphy.h @@ -195,4 +195,24 @@ #define MII_BRCM_FET_SHDW_AUXSTAT2 0x1b /* Auxiliary status 2 */ #define MII_BRCM_FET_SHDW_AS2_APDE 0x0020 /* Auto power down enable */ +/* + * Indirect register access functions for the 1000BASE-T/100BASE-TX/10BASE-T + * 0x1c shadow registers. + */ +static inline int bcm54xx_shadow_read(struct phy_device *phydev, u16 shadow) +{ + phy_write(phydev, MII_BCM54XX_SHD, MII_BCM54XX_SHD_VAL(shadow)); + return MII_BCM54XX_SHD_DATA(phy_read(phydev, MII_BCM54XX_SHD)); +} + +static inline int bcm54xx_shadow_write(struct phy_device *phydev, u16 shadow, + u16 val) +{ + return phy_write(phydev, MII_BCM54XX_SHD, + MII_BCM54XX_SHD_WRITE | + MII_BCM54XX_SHD_VAL(shadow) | + MII_BCM54XX_SHD_DATA(val)); +} + + #endif /* _LINUX_BRCMPHY_H */ -- cgit v1.2.3-59-g8ed1b From 66ce7fb9807b036058aa380bfd2b3851ae25ce39 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 22 Aug 2014 18:55:43 -0700 Subject: net: phy: export phy_{read,write}_mmd_indirect Some PHY drivers might need to access Clause 45 registers in Clause 22 compatibility mode to e.g: properly advertise EEE support when disabled by default. Export these two helper functions: phy_read_mmd_indirect() and phy_write_mmd_indirect() for drivers to use them. Signed-off-by: Florian Fainelli Signed-off-by: David S. 
Miller --- drivers/net/phy/phy.c | 6 ++++-- include/linux/phy.h | 27 +++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index c94e2a27446a..e7a5893f32ff 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -955,7 +955,7 @@ static inline void mmd_phy_indirect(struct mii_bus *bus, int prtad, int devad, * 3) Write reg 13 // MMD Data Command for MMD DEVAD * 3) Read reg 14 // Read MMD data */ -static int phy_read_mmd_indirect(struct phy_device *phydev, int prtad, +int phy_read_mmd_indirect(struct phy_device *phydev, int prtad, int devad, int addr) { struct phy_driver *phydrv = phydev->drv; @@ -971,6 +971,7 @@ static int phy_read_mmd_indirect(struct phy_device *phydev, int prtad, } return value; } +EXPORT_SYMBOL(phy_read_mmd_indirect); /** * phy_write_mmd_indirect - writes data to the MMD registers @@ -988,7 +989,7 @@ static int phy_read_mmd_indirect(struct phy_device *phydev, int prtad, * 3) Write reg 13 // MMD Data Command for MMD DEVAD * 3) Write reg 14 // Write MMD data */ -static void phy_write_mmd_indirect(struct phy_device *phydev, int prtad, +void phy_write_mmd_indirect(struct phy_device *phydev, int prtad, int devad, int addr, u32 data) { struct phy_driver *phydrv = phydev->drv; @@ -1002,6 +1003,7 @@ static void phy_write_mmd_indirect(struct phy_device *phydev, int prtad, phydrv->write_mmd_indirect(phydev, prtad, devad, addr, data); } } +EXPORT_SYMBOL(phy_write_mmd_indirect); /** * phy_init_eee - init and check the EEE feature diff --git a/include/linux/phy.h b/include/linux/phy.h index ed39956b5613..d090cfcaa167 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -597,6 +597,19 @@ static inline int phy_read_mmd(struct phy_device *phydev, int devad, u32 regnum) MII_ADDR_C45 | (devad << 16) | (regnum & 0xffff)); } +/** + * phy_read_mmd_indirect - reads data from the MMD registers + * @phydev: The PHY device bus + * @prtad: MMD Address + * @devad: MMD DEVAD + * @addr: PHY address on the MII bus + * + * Description: it reads data from the MMD registers (clause 22 to access to + * clause 45) of the specified phy address. + */ +int phy_read_mmd_indirect(struct phy_device *phydev, int prtad, + int devad, int addr); + /** * phy_read - Convenience function for reading a given PHY register * @phydev: the phy_device struct @@ -668,6 +681,20 @@ static inline int phy_write_mmd(struct phy_device *phydev, int devad, return mdiobus_write(phydev->bus, phydev->addr, regnum, val); } +/** + * phy_write_mmd_indirect - writes data to the MMD registers + * @phydev: The PHY device + * @prtad: MMD Address + * @devad: MMD DEVAD + * @addr: PHY address on the MII bus + * @data: data to write in the MMD register + * + * Description: Write data from the MMD registers of the specified + * phy address. + */ +void phy_write_mmd_indirect(struct phy_device *phydev, int prtad, + int devad, int addr, u32 data); + struct phy_device *phy_device_create(struct mii_bus *bus, int addr, int phy_id, bool is_c45, struct phy_c45_device_ids *c45_ids); -- cgit v1.2.3-59-g8ed1b From b8f9a02924bbeb0c46ca4c19561cbe765b80e264 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 22 Aug 2014 18:55:45 -0700 Subject: net: phy: bcm7xxx: enable EEE at the PHY level The 28nm Gigabit PHY on BCM7xxx chips comes out of reset with absolutely no EEE capabilities, such that we would actually return that we do not support EEE when accessing 3.20 (MDIO_PCS_EEE_ABLE) registers. 
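For reference, register 3.20 is reached through the clause 22 indirect helpers exported by the previous patch. A minimal sketch of such a probe, assuming only phy_read_mmd_indirect() from above and the standard MDIO_* names in linux/mdio.h:

#include <linux/phy.h>
#include <linux/mdio.h>

/* Sketch: ask the PHY whether it claims 100TX/1000T EEE ability by
 * reading MMD 3, register 0x14 (MDIO_PCS_EEE_ABLE) indirectly.
 * Returns 1 if either mode is reported as supported, 0 if not,
 * or a negative error code from the MDIO access.
 */
static int sketch_eee_able(struct phy_device *phydev)
{
	int val = phy_read_mmd_indirect(phydev, MDIO_PCS_EEE_ABLE,
					MDIO_MMD_PCS, phydev->addr);

	if (val < 0)
		return val;

	return !!(val & (MDIO_EEE_100TX | MDIO_EEE_1000T));
}

On this particular PHY the ability word reads back as zero out of reset, which is the symptom described above.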
Poke through the vendor-specific C45 register to enable EEE globally at the PHY level, and advertise supported EEE modes. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/bcm7xxx.c | 31 +++++++++++++++++++++++++++++++ include/linux/brcmphy.h | 3 +++ 2 files changed, 34 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/phy/bcm7xxx.c b/drivers/net/phy/bcm7xxx.c index 29e256a4ed57..e98c510b75c5 100644 --- a/drivers/net/phy/bcm7xxx.c +++ b/drivers/net/phy/bcm7xxx.c @@ -14,6 +14,7 @@ #include #include #include +#include /* Broadcom BCM7xxx internal PHY registers */ #define MII_BCM7XXX_CHANNEL_WIDTH 0x2000 @@ -167,6 +168,32 @@ static int bcm7xxx_apd_enable(struct phy_device *phydev) return bcm54xx_shadow_write(phydev, BCM54XX_SHD_APD, val); } +static int bcm7xxx_eee_enable(struct phy_device *phydev) +{ + int val; + + val = phy_read_mmd_indirect(phydev, BRCM_CL45VEN_EEE_CONTROL, + MDIO_MMD_AN, phydev->addr); + if (val < 0) + return val; + + /* Enable general EEE feature at the PHY level */ + val |= LPI_FEATURE_EN | LPI_FEATURE_EN_DIG1000X; + + phy_write_mmd_indirect(phydev, BRCM_CL45VEN_EEE_CONTROL, + MDIO_MMD_AN, phydev->addr, val); + + /* Advertise supported modes */ + val = phy_read_mmd_indirect(phydev, MDIO_AN_EEE_ADV, + MDIO_MMD_AN, phydev->addr); + + val |= (MDIO_AN_EEE_ADV_100TX | MDIO_AN_EEE_ADV_1000T); + phy_write_mmd_indirect(phydev, MDIO_AN_EEE_ADV, + MDIO_MMD_AN, phydev->addr, val); + + return 0; +} + static int bcm7xxx_28nm_config_init(struct phy_device *phydev) { int ret; @@ -179,6 +206,10 @@ static int bcm7xxx_28nm_config_init(struct phy_device *phydev) if (ret) return ret; + ret = bcm7xxx_eee_enable(phydev); + if (ret) + return ret; + return bcm7xxx_apd_enable(phydev); } diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h index 722cf26567fa..ee1431d976fa 100644 --- a/include/linux/brcmphy.h +++ b/include/linux/brcmphy.h @@ -214,5 +214,8 @@ static inline int bcm54xx_shadow_write(struct phy_device *phydev, u16 shadow, MII_BCM54XX_SHD_DATA(val)); } +#define BRCM_CL45VEN_EEE_CONTROL 0x803d +#define LPI_FEATURE_EN 0x8000 +#define LPI_FEATURE_EN_DIG1000X 0x4000 #endif /* _LINUX_BRCMPHY_H */ -- cgit v1.2.3-59-g8ed1b From 690e36e726d00d2528bc569809048adf61550d80 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sat, 23 Aug 2014 12:13:41 -0700 Subject: net: Allow raw buffers to be passed into the flow dissector. Drivers, and perhaps other entities we have not yet considered, sometimes want to know how deep the protocol headers go before deciding how large of an SKB to allocate and how much of the packet to place into the linear SKB area. For example, consider a driver which has a device which DMAs into pools of pages and then tells the driver where the data went in the DMA descriptor(s). The driver can then build an SKB and reference most of the data via SKB fragments (which are page/offset/length triplets). However at least some of the front of the packet should be placed into the linear SKB area, which comes before the fragments, so that packet processing can get at the headers efficiently. The first thing each protocol layer is going to do is a "pskb_may_pull()" so we might as well aggregate as much of this as possible while we're building the SKB in the driver. Part of supporting this is that we don't have an SKB yet, so we want to be able to let the flow dissector operate on a raw buffer in order to compute the offset of the end of the headers. 
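The low-level building block is the __skb_header_pointer() variant added below, which can work without any skb at all when the caller hands it the raw buffer. A minimal sketch, assuming only that helper:

#include <linux/skbuff.h>
#include <linux/if_ether.h>

/* Sketch: peek at the EtherType of a raw RX buffer before an skb
 * exists. With skb == NULL there is no skb_copy_bits() fallback, so
 * the helper returns NULL unless the request fits inside
 * [data, data + hlen).
 */
static __be16 sketch_peek_proto(void *data, int hlen)
{
	struct ethhdr _eth;
	const struct ethhdr *eth;

	eth = __skb_header_pointer(NULL, 0, sizeof(_eth), data, hlen, &_eth);

	return eth ? eth->h_proto : 0;
}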
So now we have a __skb_flow_dissect() which takes an explicit data pointer and length. Signed-off-by: David S. Miller --- include/linux/skbuff.h | 18 ++++++++++++------ include/net/flow_keys.h | 14 ++++++++++++-- net/core/flow_dissector.c | 40 ++++++++++++++++++++++++++-------------- 3 files changed, 50 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index abde271c18ae..18ddf9684a27 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2567,20 +2567,26 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len, __wsum skb_checksum(const struct sk_buff *skb, int offset, int len, __wsum csum); -static inline void *skb_header_pointer(const struct sk_buff *skb, int offset, - int len, void *buffer) +static inline void *__skb_header_pointer(const struct sk_buff *skb, int offset, + int len, void *data, int hlen, void *buffer) { - int hlen = skb_headlen(skb); - if (hlen - offset >= len) - return skb->data + offset; + return data + offset; - if (skb_copy_bits(skb, offset, buffer, len) < 0) + if (!skb || + skb_copy_bits(skb, offset, buffer, len) < 0) return NULL; return buffer; } +static inline void *skb_header_pointer(const struct sk_buff *skb, int offset, + int len, void *buffer) +{ + return __skb_header_pointer(skb, offset, len, skb->data, + skb_headlen(skb), buffer); +} + /** * skb_needs_linearize - check if we need to linearize a given skb * depending on the given device features. diff --git a/include/net/flow_keys.h b/include/net/flow_keys.h index 6667a054763a..4040f63932c5 100644 --- a/include/net/flow_keys.h +++ b/include/net/flow_keys.h @@ -27,7 +27,17 @@ struct flow_keys { u8 ip_proto; }; -bool skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow); -__be32 skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto); +bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow, + void *data, int hlen); +static inline bool skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow) +{ + return __skb_flow_dissect(skb, flow, NULL, 0); +} +__be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, + void *data, int hlen_proto); +static inline __be32 skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto) +{ + return __skb_flow_get_ports(skb, thoff, ip_proto, NULL, 0); +} u32 flow_hash_from_keys(struct flow_keys *keys); #endif diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 5f362c1d0332..660c6492fb78 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -34,29 +34,40 @@ static void iph_to_flow_copy_addrs(struct flow_keys *flow, const struct iphdr *i * The function will try to retrieve the ports at offset thoff + poff where poff * is the protocol port offset returned from proto_ports_offset */ -__be32 skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto) +__be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, + void *data, int hlen) { int poff = proto_ports_offset(ip_proto); + if (!data) { + data = skb->data; + hlen = skb_headlen(skb); + } + if (poff >= 0) { __be32 *ports, _ports; - ports = skb_header_pointer(skb, thoff + poff, - sizeof(_ports), &_ports); + ports = __skb_header_pointer(skb, thoff + poff, + sizeof(_ports), data, hlen, &_ports); if (ports) return *ports; } return 0; } -EXPORT_SYMBOL(skb_flow_get_ports); +EXPORT_SYMBOL(__skb_flow_get_ports); -bool skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow) +bool 
__skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow, void *data, int hlen) { int nhoff = skb_network_offset(skb); u8 ip_proto; __be16 proto = skb->protocol; + if (!data) { + data = skb->data; + hlen = skb_headlen(skb); + } + memset(flow, 0, sizeof(*flow)); again: @@ -65,7 +76,7 @@ again: const struct iphdr *iph; struct iphdr _iph; ip: - iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph); + iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph); if (!iph || iph->ihl < 5) return false; nhoff += iph->ihl * 4; @@ -83,7 +94,7 @@ ip: __be32 flow_label; ipv6: - iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph); + iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph); if (!iph) return false; @@ -113,7 +124,7 @@ ipv6: const struct vlan_hdr *vlan; struct vlan_hdr _vlan; - vlan = skb_header_pointer(skb, nhoff, sizeof(_vlan), &_vlan); + vlan = __skb_header_pointer(skb, nhoff, sizeof(_vlan), data, hlen, &_vlan); if (!vlan) return false; @@ -126,7 +137,7 @@ ipv6: struct pppoe_hdr hdr; __be16 proto; } *hdr, _hdr; - hdr = skb_header_pointer(skb, nhoff, sizeof(_hdr), &_hdr); + hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) return false; proto = hdr->proto; @@ -151,7 +162,7 @@ ipv6: __be16 proto; } *hdr, _hdr; - hdr = skb_header_pointer(skb, nhoff, sizeof(_hdr), &_hdr); + hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) return false; /* @@ -171,8 +182,9 @@ ipv6: const struct ethhdr *eth; struct ethhdr _eth; - eth = skb_header_pointer(skb, nhoff, - sizeof(_eth), &_eth); + eth = __skb_header_pointer(skb, nhoff, + sizeof(_eth), + data, hlen, &_eth); if (!eth) return false; proto = eth->h_proto; @@ -194,12 +206,12 @@ ipv6: flow->n_proto = proto; flow->ip_proto = ip_proto; - flow->ports = skb_flow_get_ports(skb, nhoff, ip_proto); + flow->ports = __skb_flow_get_ports(skb, nhoff, ip_proto, data, hlen); flow->thoff = (u16) nhoff; return true; } -EXPORT_SYMBOL(skb_flow_dissect); +EXPORT_SYMBOL(__skb_flow_dissect); static u32 hashrnd __read_mostly; static __always_inline void __flow_hash_secret_init(void) -- cgit v1.2.3-59-g8ed1b From 1b05756c48ea07ced9604ef01d11194d936da163 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Tue, 5 Aug 2014 22:02:34 +0200 Subject: netfilter: ipset: Fix warn: integer overflows 'sizeof(*map) + size * set->dsize' Dan Carpenter reported that the static checker emits the warning net/netfilter/ipset/ip_set_list_set.c:600 init_list_set() warn: integer overflows 'sizeof(*map) + size * set->dsize' Limit the maximal number of elements in list type of sets. 
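As a rough illustration of the pattern (only IP_SET_LIST_MAX_SIZE below comes from this patch; the helper name and parameters are made up):

#include <linux/kernel.h>

#define IP_SET_LIST_MAX_SIZE	65536

/* Illustration: clamp an untrusted element count before it enters the
 * allocation-size arithmetic. With size capped at 65536 the product
 * stays far away from wrapping for any sane per-element dsize.
 */
static size_t sketch_list_set_bytes(size_t hdr, u32 size, size_t dsize)
{
	return hdr + min_t(u32, size, IP_SET_LIST_MAX_SIZE) * dsize;
}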
Signed-off-by: Jozsef Kadlecsik --- include/linux/netfilter/ipset/ip_set_list.h | 1 + net/netfilter/ipset/ip_set_list_set.c | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/ipset/ip_set_list.h b/include/linux/netfilter/ipset/ip_set_list.h index 68c2aea897f5..fe2622a00151 100644 --- a/include/linux/netfilter/ipset/ip_set_list.h +++ b/include/linux/netfilter/ipset/ip_set_list.h @@ -6,5 +6,6 @@ #define IP_SET_LIST_DEFAULT_SIZE 8 #define IP_SET_LIST_MIN_SIZE 4 +#define IP_SET_LIST_MAX_SIZE 65536 #endif /* __IP_SET_LIST_H */ diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c index 3e2317f3cf68..f87adbad6076 100644 --- a/net/netfilter/ipset/ip_set_list_set.c +++ b/net/netfilter/ipset/ip_set_list_set.c @@ -597,7 +597,9 @@ init_list_set(struct net *net, struct ip_set *set, u32 size) struct set_elem *e; u32 i; - map = kzalloc(sizeof(*map) + size * set->dsize, GFP_KERNEL); + map = kzalloc(sizeof(*map) + + min_t(u32, size, IP_SET_LIST_MAX_SIZE) * set->dsize, + GFP_KERNEL); if (!map) return false; -- cgit v1.2.3-59-g8ed1b From 573e8fca255a27e3573b51f9b183d62641c47a3d Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Fri, 22 Aug 2014 13:33:47 -0700 Subject: net: skb_gro_checksum_* functions Add skb_gro_checksum_validate, skb_gro_checksum_validate_zero_check, and skb_gro_checksum_simple_validate, and __skb_gro_checksum_complete. These are the cognates of the normal checksum functions but are used in the gro_receive path and operate on GRO related fields in sk_buffs. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/netdevice.h | 76 +++++++++++++++++++++++++++++++++++++++++++++-- net/core/dev.c | 34 ++++++++++++++++++++- 2 files changed, 107 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7e2b0b8b5cd7..eb73444e1bd0 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1883,7 +1883,13 @@ struct napi_gro_cb { u16 proto; /* Used in udp_gro_receive */ - u16 udp_mark; + u8 udp_mark:1; + + /* GRO checksum is valid */ + u8 csum_valid:1; + + /* Number encapsulation layers crossed */ + u8 encapsulation; /* used to support CHECKSUM_COMPLETE for tunneling protocols */ __wsum csum; @@ -2154,11 +2160,77 @@ static inline void *skb_gro_network_header(struct sk_buff *skb) static inline void skb_gro_postpull_rcsum(struct sk_buff *skb, const void *start, unsigned int len) { - if (skb->ip_summed == CHECKSUM_COMPLETE) + if (NAPI_GRO_CB(skb)->csum_valid) NAPI_GRO_CB(skb)->csum = csum_sub(NAPI_GRO_CB(skb)->csum, csum_partial(start, len, 0)); } +/* GRO checksum functions. These are logical equivalents of the normal + * checksum functions (in skbuff.h) except that they operate on the GRO + * offsets and fields in sk_buff. 
+ */ + +__sum16 __skb_gro_checksum_complete(struct sk_buff *skb); + +static inline bool __skb_gro_checksum_validate_needed(struct sk_buff *skb, + bool zero_okay, + __sum16 check) +{ + return (skb->ip_summed != CHECKSUM_PARTIAL && + (skb->ip_summed != CHECKSUM_UNNECESSARY || + (NAPI_GRO_CB(skb)->encapsulation > skb->encapsulation)) && + (!zero_okay || check)); +} + +static inline __sum16 __skb_gro_checksum_validate_complete(struct sk_buff *skb, + __wsum psum) +{ + if (NAPI_GRO_CB(skb)->csum_valid && + !csum_fold(csum_add(psum, NAPI_GRO_CB(skb)->csum))) + return 0; + + NAPI_GRO_CB(skb)->csum = psum; + + return __skb_gro_checksum_complete(skb); +} + +/* Update skb for CHECKSUM_UNNECESSARY when we verified a top level + * checksum or an encapsulated one during GRO. This saves work + * if we fallback to normal path with the packet. + */ +static inline void skb_gro_incr_csum_unnecessary(struct sk_buff *skb) +{ + if (skb->ip_summed == CHECKSUM_UNNECESSARY) { + if (NAPI_GRO_CB(skb)->encapsulation) + skb->encapsulation = 1; + } else if (skb->ip_summed != CHECKSUM_PARTIAL) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + skb->encapsulation = 0; + } +} + +#define __skb_gro_checksum_validate(skb, proto, zero_okay, check, \ + compute_pseudo) \ +({ \ + __sum16 __ret = 0; \ + if (__skb_gro_checksum_validate_needed(skb, zero_okay, check)) \ + __ret = __skb_gro_checksum_validate_complete(skb, \ + compute_pseudo(skb, proto)); \ + if (!__ret) \ + skb_gro_incr_csum_unnecessary(skb); \ + __ret; \ +}) + +#define skb_gro_checksum_validate(skb, proto, compute_pseudo) \ + __skb_gro_checksum_validate(skb, proto, false, 0, compute_pseudo) + +#define skb_gro_checksum_validate_zero_check(skb, proto, check, \ + compute_pseudo) \ + __skb_gro_checksum_validate(skb, proto, true, check, compute_pseudo) + +#define skb_gro_checksum_simple_validate(skb) \ + __skb_gro_checksum_validate(skb, 0, false, 0, null_compute_pseudo) + static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, diff --git a/net/core/dev.c b/net/core/dev.c index 1421dad4cb29..b6a718ec11c1 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3962,7 +3962,13 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff goto normal; gro_list_prepare(napi, skb); - NAPI_GRO_CB(skb)->csum = skb->csum; /* Needed for CHECKSUM_COMPLETE */ + + if (skb->ip_summed == CHECKSUM_COMPLETE) { + NAPI_GRO_CB(skb)->csum = skb->csum; + NAPI_GRO_CB(skb)->csum_valid = 1; + } else { + NAPI_GRO_CB(skb)->csum_valid = 0; + } rcu_read_lock(); list_for_each_entry_rcu(ptype, head, list) { @@ -3975,6 +3981,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff NAPI_GRO_CB(skb)->flush = 0; NAPI_GRO_CB(skb)->free = 0; NAPI_GRO_CB(skb)->udp_mark = 0; + NAPI_GRO_CB(skb)->encapsulation = 0; pp = ptype->callbacks.gro_receive(&napi->gro_list, skb); break; @@ -4205,6 +4212,31 @@ gro_result_t napi_gro_frags(struct napi_struct *napi) } EXPORT_SYMBOL(napi_gro_frags); +/* Compute the checksum from gro_offset and return the folded value + * after adding in any pseudo checksum. 
+ */ +__sum16 __skb_gro_checksum_complete(struct sk_buff *skb) +{ + __wsum wsum; + __sum16 sum; + + wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0); + + /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */ + sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum)); + if (likely(!sum)) { + if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && + !skb->csum_complete_sw) + netdev_rx_csum_fault(skb->dev); + } + + NAPI_GRO_CB(skb)->csum = wsum; + NAPI_GRO_CB(skb)->csum_valid = 1; + + return sum; +} +EXPORT_SYMBOL(__skb_gro_checksum_complete); + /* * net_rps_action_and_irq_enable sends any pending IPI's for rps. * Note: called with local irq disabled, but exits with local irq enabled. -- cgit v1.2.3-59-g8ed1b From a98406e22c12e514bac28fec0a49dc793edaf3a8 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 23 Aug 2014 17:03:28 +0200 Subject: random32: improvements to prandom_bytes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch addresses a couple of minor items, mostly addressing prandom_bytes(): 1) prandom_bytes{,_state}() should use size_t for length arguments, 2) We can use put_unaligned() when filling the array instead of open coding it [ perhaps some archs will further benefit from their own arch specific implementation when GCC cannot make up for it ], 3) Fix a typo, 4) Better use unsigned int as type for getting the arch seed, 5) Make use of prandom_u32_max() for timer slack. Regarding the change to put_unaligned(), callers of prandom_bytes() which internally invoke prandom_bytes_state(), don't bother as they expect the array to be filled randomly and don't have any control of the internal state what-so-ever (that's also why we have periodic reseeding there, etc), so they really don't care. Now for the direct callers of prandom_bytes_state(), which are solely located in test cases for MTD devices, that is, drivers/mtd/tests/{oobtest.c,pagetest.c,subpagetest.c}: These tests basically fill a test write-vector through prandom_bytes_state() with an a-priori defined seed each time and write that to a MTD device. Later on, they set up a read-vector and read back those blocks from the device. So in the verification phase, the write-vector is being re-setup [ so same seed and prandom_bytes_state() called ], and then memcmp()'ed against the read-vector to check if the data is the same. Akinobu, Lothar and I also tested this patch and it runs through the 3 relevant MTD test cases w/o any errors on the nandsim device (simulator for MTD devs) for x86_64, ppc64, ARM (i.MX28, i.MX53 and i.MX6): # modprobe nandsim first_id_byte=0x20 second_id_byte=0xac \ third_id_byte=0x00 fourth_id_byte=0x15 # modprobe mtd_oobtest dev=0 # modprobe mtd_pagetest dev=0 # modprobe mtd_subpagetest dev=0 We also don't have any users depending directly on a particular result of the PRNG (except the PRNG self-test itself), and that's just fine as it e.g. allowed us easily to do things like upgrading from taus88 to taus113. Signed-off-by: Daniel Borkmann Tested-by: Akinobu Mita Tested-by: Lothar Waßmann Cc: Hannes Frederic Sowa Signed-off-by: David S.
Miller --- include/linux/random.h | 4 ++-- lib/random32.c | 39 ++++++++++++++++++--------------------- 2 files changed, 20 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/random.h b/include/linux/random.h index 57fbbffd77a0..b05856e16b75 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -26,7 +26,7 @@ unsigned int get_random_int(void); unsigned long randomize_range(unsigned long start, unsigned long end, unsigned long len); u32 prandom_u32(void); -void prandom_bytes(void *buf, int nbytes); +void prandom_bytes(void *buf, size_t nbytes); void prandom_seed(u32 seed); void prandom_reseed_late(void); @@ -35,7 +35,7 @@ struct rnd_state { }; u32 prandom_u32_state(struct rnd_state *state); -void prandom_bytes_state(struct rnd_state *state, void *buf, int nbytes); +void prandom_bytes_state(struct rnd_state *state, void *buf, size_t nbytes); /** * prandom_u32_max - returns a pseudo-random number in interval [0, ep_ro) diff --git a/lib/random32.c b/lib/random32.c index c9b6bf3afe0c..0bee183fa18f 100644 --- a/lib/random32.c +++ b/lib/random32.c @@ -37,6 +37,7 @@ #include #include #include +#include #ifdef CONFIG_RANDOM32_SELFTEST static void __init prandom_state_selftest(void); @@ -96,27 +97,23 @@ EXPORT_SYMBOL(prandom_u32); * This is used for pseudo-randomness with no outside seeding. * For more random results, use prandom_bytes(). */ -void prandom_bytes_state(struct rnd_state *state, void *buf, int bytes) +void prandom_bytes_state(struct rnd_state *state, void *buf, size_t bytes) { - unsigned char *p = buf; - int i; - - for (i = 0; i < round_down(bytes, sizeof(u32)); i += sizeof(u32)) { - u32 random = prandom_u32_state(state); - int j; + u8 *ptr = buf; - for (j = 0; j < sizeof(u32); j++) { - p[i + j] = random; - random >>= BITS_PER_BYTE; - } + while (bytes >= sizeof(u32)) { + put_unaligned(prandom_u32_state(state), (u32 *) ptr); + ptr += sizeof(u32); + bytes -= sizeof(u32); } - if (i < bytes) { - u32 random = prandom_u32_state(state); - for (; i < bytes; i++) { - p[i] = random; - random >>= BITS_PER_BYTE; - } + if (bytes > 0) { + u32 rem = prandom_u32_state(state); + do { + *ptr++ = (u8) rem; + bytes--; + rem >>= BITS_PER_BYTE; + } while (bytes > 0); } } EXPORT_SYMBOL(prandom_bytes_state); @@ -126,7 +123,7 @@ EXPORT_SYMBOL(prandom_bytes_state); * @buf: where to copy the pseudo-random bytes to * @bytes: the requested number of bytes */ -void prandom_bytes(void *buf, int bytes) +void prandom_bytes(void *buf, size_t bytes) { struct rnd_state *state = &get_cpu_var(net_rand_state); @@ -137,7 +134,7 @@ EXPORT_SYMBOL(prandom_bytes); static void prandom_warmup(struct rnd_state *state) { - /* Calling RNG ten times to satify recurrence condition */ + /* Calling RNG ten times to satisfy recurrence condition */ prandom_u32_state(state); prandom_u32_state(state); prandom_u32_state(state); @@ -152,7 +149,7 @@ static void prandom_warmup(struct rnd_state *state) static u32 __extract_hwseed(void) { - u32 val = 0; + unsigned int val = 0; (void)(arch_get_random_seed_int(&val) || arch_get_random_int(&val)); @@ -228,7 +225,7 @@ static void __prandom_timer(unsigned long dontcare) prandom_seed(entropy); /* reseed every ~60 seconds, in [40 .. 80) interval with slack */ - expires = 40 + (prandom_u32() % 40); + expires = 40 + prandom_u32_max(40); seed_timer.expires = jiffies + msecs_to_jiffies(expires * MSEC_PER_SEC); add_timer(&seed_timer); -- cgit v1.2.3-59-g8ed1b From 4798248e4e023170e937a65a1d30fcc52496dd42 Mon Sep 17 00:00:00 2001 From: "David S. 
Miller" Date: Fri, 22 Aug 2014 16:21:53 -0700 Subject: net: Add ops->ndo_xmit_flush() Signed-off-by: David S. Miller --- drivers/net/wan/dlci.c | 2 +- drivers/usb/gadget/function/f_ncm.c | 2 +- include/linux/netdevice.h | 35 +++++++++++++++++++++++++++++++++++ net/atm/mpc.c | 2 +- net/core/dev.c | 5 ++--- net/core/netpoll.c | 3 +-- net/core/pktgen.c | 4 +--- net/packet/af_packet.c | 3 +-- net/sched/sch_teql.c | 3 +-- 9 files changed, 44 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/wan/dlci.c b/drivers/net/wan/dlci.c index 43c9960dce1c..81b22a180aad 100644 --- a/drivers/net/wan/dlci.c +++ b/drivers/net/wan/dlci.c @@ -193,7 +193,7 @@ static netdev_tx_t dlci_transmit(struct sk_buff *skb, struct net_device *dev) struct dlci_local *dlp = netdev_priv(dev); if (skb) - dlp->slave->netdev_ops->ndo_start_xmit(skb, dlp->slave); + netdev_start_xmit(skb, dlp->slave); return NETDEV_TX_OK; } diff --git a/drivers/usb/gadget/function/f_ncm.c b/drivers/usb/gadget/function/f_ncm.c index bcdc882cd415..cb5d646db6a7 100644 --- a/drivers/usb/gadget/function/f_ncm.c +++ b/drivers/usb/gadget/function/f_ncm.c @@ -1101,7 +1101,7 @@ static void ncm_tx_tasklet(unsigned long data) /* Only send if data is available. */ if (ncm->skb_tx_data) { ncm->timer_force_tx = true; - ncm->netdev->netdev_ops->ndo_start_xmit(NULL, ncm->netdev); + netdev_start_xmit(NULL, ncm->netdev); ncm->timer_force_tx = false; } } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index eb73444e1bd0..220c50984688 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -782,6 +782,19 @@ typedef u16 (*select_queue_fallback_t)(struct net_device *dev, * (can also return NETDEV_TX_LOCKED iff NETIF_F_LLTX) * Required can not be NULL. * + * void (*ndo_xmit_flush)(struct net_device *dev, u16 queue); + * A driver implements this function when it wishes to support + * deferred TX queue flushing. The idea is that the expensive + * operation to trigger TX queue processing can be done after + * N calls to ndo_start_xmit rather than being done every single + * time. In this regime ndo_start_xmit will be called one or more + * times, and then a final ndo_xmit_flush call will be made to + * have the driver tell the device about the new pending TX queue + * entries. The kernel keeps track of which queues need flushing + * by monitoring skb->queue_mapping of the packets it submits to + * ndo_start_xmit. This is the queue value that will be passed + * to ndo_xmit_flush. 
+ * * u16 (*ndo_select_queue)(struct net_device *dev, struct sk_buff *skb, * void *accel_priv, select_queue_fallback_t fallback); * Called to decide which queue to when device supports multiple @@ -1005,6 +1018,7 @@ struct net_device_ops { int (*ndo_stop)(struct net_device *dev); netdev_tx_t (*ndo_start_xmit) (struct sk_buff *skb, struct net_device *dev); + void (*ndo_xmit_flush)(struct net_device *dev, u16 queue); u16 (*ndo_select_queue)(struct net_device *dev, struct sk_buff *skb, void *accel_priv, @@ -3430,6 +3444,27 @@ int __init dev_proc_init(void); #define dev_proc_init() 0 #endif +static inline netdev_tx_t __netdev_start_xmit(const struct net_device_ops *ops, + struct sk_buff *skb, struct net_device *dev) +{ + netdev_tx_t ret; + u16 q; + + q = skb->queue_mapping; + ret = ops->ndo_start_xmit(skb, dev); + if (dev_xmit_complete(ret) && ops->ndo_xmit_flush) + ops->ndo_xmit_flush(dev, q); + + return ret; +} + +static inline netdev_tx_t netdev_start_xmit(struct sk_buff *skb, struct net_device *dev) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + return __netdev_start_xmit(ops, skb, dev); +} + int netdev_class_create_file_ns(struct class_attribute *class_attr, const void *ns); void netdev_class_remove_file_ns(struct class_attribute *class_attr, diff --git a/net/atm/mpc.c b/net/atm/mpc.c index e8e0e7a8a23d..d662da161e5a 100644 --- a/net/atm/mpc.c +++ b/net/atm/mpc.c @@ -599,7 +599,7 @@ static netdev_tx_t mpc_send_packet(struct sk_buff *skb, } non_ip: - return mpc->old_ops->ndo_start_xmit(skb, dev); + return __netdev_start_xmit(mpc->old_ops, skb, dev); } static int atm_mpoa_vcc_attach(struct atm_vcc *vcc, void __user *arg) diff --git a/net/core/dev.c b/net/core/dev.c index b6a718ec11c1..26d296c2447c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2602,7 +2602,6 @@ EXPORT_SYMBOL(netif_skb_features); int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq) { - const struct net_device_ops *ops = dev->netdev_ops; int rc = NETDEV_TX_OK; unsigned int skb_len; @@ -2667,7 +2666,7 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, skb_len = skb->len; trace_net_dev_start_xmit(skb, dev); - rc = ops->ndo_start_xmit(skb, dev); + rc = netdev_start_xmit(skb, dev); trace_net_dev_xmit(skb, rc, dev, skb_len); if (rc == NETDEV_TX_OK) txq_trans_update(txq); @@ -2686,7 +2685,7 @@ gso: skb_len = nskb->len; trace_net_dev_start_xmit(nskb, dev); - rc = ops->ndo_start_xmit(nskb, dev); + rc = netdev_start_xmit(nskb, dev); trace_net_dev_xmit(nskb, rc, dev, skb_len); if (unlikely(rc != NETDEV_TX_OK)) { if (rc & ~NETDEV_TX_MASK) diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 907fb5e36c02..a5ad06828d67 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -72,7 +72,6 @@ module_param(carrier_timeout, uint, 0644); static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq) { - const struct net_device_ops *ops = dev->netdev_ops; int status = NETDEV_TX_OK; netdev_features_t features; @@ -92,7 +91,7 @@ static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev, skb->vlan_tci = 0; } - status = ops->ndo_start_xmit(skb, dev); + status = netdev_start_xmit(skb, dev); if (status == NETDEV_TX_OK) txq_trans_update(txq); diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 8b849ddfef2e..83e2b4b19eb7 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3285,8 +3285,6 @@ static void pktgen_wait_for_skb(struct pktgen_dev *pkt_dev) static void pktgen_xmit(struct pktgen_dev 
*pkt_dev) { struct net_device *odev = pkt_dev->odev; - netdev_tx_t (*xmit)(struct sk_buff *, struct net_device *) - = odev->netdev_ops->ndo_start_xmit; struct netdev_queue *txq; u16 queue_map; int ret; @@ -3339,7 +3337,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) goto unlock; } atomic_inc(&(pkt_dev->skb->users)); - ret = (*xmit)(pkt_dev->skb, odev); + ret = netdev_start_xmit(pkt_dev->skb, odev); switch (ret) { case NETDEV_TX_OK: diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 93896d2092f6..0dfa990d4eaa 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -240,7 +240,6 @@ static void __fanout_link(struct sock *sk, struct packet_sock *po); static int packet_direct_xmit(struct sk_buff *skb) { struct net_device *dev = skb->dev; - const struct net_device_ops *ops = dev->netdev_ops; netdev_features_t features; struct netdev_queue *txq; int ret = NETDEV_TX_BUSY; @@ -262,7 +261,7 @@ static int packet_direct_xmit(struct sk_buff *skb) HARD_TX_LOCK(dev, txq, smp_processor_id()); if (!netif_xmit_frozen_or_drv_stopped(txq)) { - ret = ops->ndo_start_xmit(skb, dev); + ret = netdev_start_xmit(skb, dev); if (ret == NETDEV_TX_OK) txq_trans_update(txq); } diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c index bd33793b527e..64cd93ca8104 100644 --- a/net/sched/sch_teql.c +++ b/net/sched/sch_teql.c @@ -301,7 +301,6 @@ restart: do { struct net_device *slave = qdisc_dev(q); struct netdev_queue *slave_txq = netdev_get_tx_queue(slave, 0); - const struct net_device_ops *slave_ops = slave->netdev_ops; if (slave_txq->qdisc_sleeping != q) continue; @@ -317,7 +316,7 @@ restart: unsigned int length = qdisc_pkt_len(skb); if (!netif_xmit_frozen_or_stopped(slave_txq) && - slave_ops->ndo_start_xmit(skb, slave) == NETDEV_TX_OK) { + netdev_start_xmit(skb, slave) == NETDEV_TX_OK) { txq_trans_update(slave_txq); __netif_tx_unlock(slave_txq); master->slaves = NEXT_SLAVE(q); -- cgit v1.2.3-59-g8ed1b From 48ea526a6877d605c961aa37fae33f3227b29424 Mon Sep 17 00:00:00 2001 From: Amir Vadai Date: Mon, 25 Aug 2014 16:06:53 +0300 Subject: net/mlx4: Use is_kdump_kernel() to detect kdump kernel Use is_kdump_kernel() to detect kdump kernel, instead of reset_devices. Signed-off-by: Amir Vadai Signed-off-by: David S. Miller --- include/linux/mlx4/device.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 071f6b234604..783dd099abd1 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -38,6 +38,7 @@ #include #include #include +#include #include @@ -1275,7 +1276,7 @@ int mlx4_mr_rereg_mem_write(struct mlx4_dev *dev, struct mlx4_mr *mr, /* Returns true if running in low memory profile (kdump kernel) */ static inline bool mlx4_low_memory_profile(void) { - return reset_devices; + return is_kdump_kernel(); } #endif /* MLX4_DEVICE_H */ -- cgit v1.2.3-59-g8ed1b From 0b725a2ca61bedc33a2a63d0451d528b268cf975 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 25 Aug 2014 15:51:53 -0700 Subject: net: Remove ndo_xmit_flush netdev operation, use signalling instead. As reported by Jesper Dangaard Brouer, for high packet rates the overhead of having another indirect call in the TX path is non-trivial. There is the indirect call itself, and then there is all of the reloading of the state to refetch the tail pointer value and then write the device register. Move to a more passive scheme, which requires very light modifications to the device drivers. 
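The igb and virtio_net conversions below both reduce to the same driver-side pattern, sketched here with a hypothetical ring (only the skb->xmit_more bit described next comes from this patch):

#include <linux/netdevice.h>
#include <linux/io.h>

/* Hypothetical TX ring: just enough state for the doorbell write */
struct sketch_ring {
	void __iomem *tail;	/* device tail/doorbell register */
	u16 next_to_use;
};

/* Hypothetical: descriptor setup elided, only bump the producer index */
static void sketch_post_desc(struct sketch_ring *ring, struct sk_buff *skb)
{
	ring->next_to_use++;
}

/* Sketch: queue the descriptor unconditionally, but only ring the
 * doorbell once the stack stops promising more packets for this queue.
 */
static netdev_tx_t sketch_xmit(struct sk_buff *skb, struct sketch_ring *ring)
{
	sketch_post_desc(ring, skb);

	if (!skb->xmit_more)
		writel(ring->next_to_use, ring->tail);

	return NETDEV_TX_OK;
}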
The signal is a new skb->xmit_more value, if it is non-zero it means that more SKBs are pending to be transmitted on the same queue as the current SKB. And therefore, the driver may elide the tail pointer update. Right now skb->xmit_more is always zero. Signed-off-by: David S. Miller --- drivers/net/ethernet/intel/igb/igb_main.c | 36 +++++++++++-------------------- drivers/net/virtio_net.c | 12 +++-------- include/linux/netdevice.h | 25 ++------------------- include/linux/skbuff.h | 2 ++ 4 files changed, 19 insertions(+), 56 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index b9c020a05fb8..89c29b40d61c 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -136,7 +136,6 @@ static void igb_update_phy_info(unsigned long); static void igb_watchdog(unsigned long); static void igb_watchdog_task(struct work_struct *); static netdev_tx_t igb_xmit_frame(struct sk_buff *skb, struct net_device *); -static void igb_xmit_flush(struct net_device *netdev, u16 queue); static struct rtnl_link_stats64 *igb_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats); static int igb_change_mtu(struct net_device *, int); @@ -2076,7 +2075,6 @@ static const struct net_device_ops igb_netdev_ops = { .ndo_open = igb_open, .ndo_stop = igb_close, .ndo_start_xmit = igb_xmit_frame, - .ndo_xmit_flush = igb_xmit_flush, .ndo_get_stats64 = igb_get_stats64, .ndo_set_rx_mode = igb_set_rx_mode, .ndo_set_mac_address = igb_set_mac, @@ -4917,6 +4915,14 @@ static void igb_tx_map(struct igb_ring *tx_ring, tx_ring->next_to_use = i; + if (!skb->xmit_more) { + writel(i, tx_ring->tail); + + /* we need this if more than one processor can write to our tail + * at a time, it synchronizes IO on IA64/Altix systems + */ + mmiowb(); + } return; dma_error: @@ -5052,20 +5058,17 @@ out_drop: return NETDEV_TX_OK; } -static struct igb_ring *__igb_tx_queue_mapping(struct igb_adapter *adapter, unsigned int r_idx) +static inline struct igb_ring *igb_tx_queue_mapping(struct igb_adapter *adapter, + struct sk_buff *skb) { + unsigned int r_idx = skb->queue_mapping; + if (r_idx >= adapter->num_tx_queues) r_idx = r_idx % adapter->num_tx_queues; return adapter->tx_ring[r_idx]; } -static inline struct igb_ring *igb_tx_queue_mapping(struct igb_adapter *adapter, - struct sk_buff *skb) -{ - return __igb_tx_queue_mapping(adapter, skb->queue_mapping); -} - static netdev_tx_t igb_xmit_frame(struct sk_buff *skb, struct net_device *netdev) { @@ -5094,21 +5097,6 @@ static netdev_tx_t igb_xmit_frame(struct sk_buff *skb, return igb_xmit_frame_ring(skb, igb_tx_queue_mapping(adapter, skb)); } -static void igb_xmit_flush(struct net_device *netdev, u16 queue) -{ - struct igb_adapter *adapter = netdev_priv(netdev); - struct igb_ring *tx_ring; - - tx_ring = __igb_tx_queue_mapping(adapter, queue); - - writel(tx_ring->next_to_use, tx_ring->tail); - - /* we need this if more than one processor can write to our tail - * at a time, it synchronizes IO on IA64/Altix systems - */ - mmiowb(); -} - /** * igb_tx_timeout - Respond to a Tx Hang * @netdev: network interface device structure diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 62421086d3e6..f0c2824f5e0f 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -953,15 +953,10 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) } } - return NETDEV_TX_OK; -} + if (!skb->xmit_more) + virtqueue_kick(sq->vq); -static 
void xmit_flush(struct net_device *dev, u16 qnum) -{ - struct virtnet_info *vi = netdev_priv(dev); - struct send_queue *sq = &vi->sq[qnum]; - - virtqueue_kick(sq->vq); + return NETDEV_TX_OK; } /* @@ -1393,7 +1388,6 @@ static const struct net_device_ops virtnet_netdev = { .ndo_open = virtnet_open, .ndo_stop = virtnet_close, .ndo_start_xmit = start_xmit, - .ndo_xmit_flush = xmit_flush, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = virtnet_set_mac_address, .ndo_set_rx_mode = virtnet_set_rx_mode, diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 220c50984688..039b23786c22 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -782,19 +782,6 @@ typedef u16 (*select_queue_fallback_t)(struct net_device *dev, * (can also return NETDEV_TX_LOCKED iff NETIF_F_LLTX) * Required can not be NULL. * - * void (*ndo_xmit_flush)(struct net_device *dev, u16 queue); - * A driver implements this function when it wishes to support - * deferred TX queue flushing. The idea is that the expensive - * operation to trigger TX queue processing can be done after - * N calls to ndo_start_xmit rather than being done every single - * time. In this regime ndo_start_xmit will be called one or more - * times, and then a final ndo_xmit_flush call will be made to - * have the driver tell the device about the new pending TX queue - * entries. The kernel keeps track of which queues need flushing - * by monitoring skb->queue_mapping of the packets it submits to - * ndo_start_xmit. This is the queue value that will be passed - * to ndo_xmit_flush. - * * u16 (*ndo_select_queue)(struct net_device *dev, struct sk_buff *skb, * void *accel_priv, select_queue_fallback_t fallback); * Called to decide which queue to when device supports multiple @@ -1018,7 +1005,6 @@ struct net_device_ops { int (*ndo_stop)(struct net_device *dev); netdev_tx_t (*ndo_start_xmit) (struct sk_buff *skb, struct net_device *dev); - void (*ndo_xmit_flush)(struct net_device *dev, u16 queue); u16 (*ndo_select_queue)(struct net_device *dev, struct sk_buff *skb, void *accel_priv, @@ -3447,15 +3433,8 @@ int __init dev_proc_init(void); static inline netdev_tx_t __netdev_start_xmit(const struct net_device_ops *ops, struct sk_buff *skb, struct net_device *dev) { - netdev_tx_t ret; - u16 q; - - q = skb->queue_mapping; - ret = ops->ndo_start_xmit(skb, dev); - if (dev_xmit_complete(ret) && ops->ndo_xmit_flush) - ops->ndo_xmit_flush(dev, q); - - return ret; + skb->xmit_more = 0; + return ops->ndo_start_xmit(skb, dev); } static inline netdev_tx_t netdev_start_xmit(struct sk_buff *skb, struct net_device *dev) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 18ddf9684a27..9b3802a197a8 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -452,6 +452,7 @@ static inline u32 skb_mstamp_us_delta(const struct skb_mstamp *t1, * @tc_verd: traffic control verdict * @hash: the packet hash * @queue_mapping: Queue mapping for multiqueue devices + * @xmit_more: More SKBs are pending for this queue * @ndisc_nodetype: router type (from link layer) * @ooo_okay: allow the mapping of a socket to a queue to be changed * @l4_hash: indicate hash is a canonical 4-tuple hash over transport @@ -558,6 +559,7 @@ struct sk_buff { __u16 queue_mapping; kmemcheck_bitfield_begin(flags2); + __u8 xmit_more:1; #ifdef CONFIG_IPV6_NDISC_NODETYPE __u8 ndisc_nodetype:2; #endif -- cgit v1.2.3-59-g8ed1b From 170fd0b1f6108b48df4369afa0ee29a83e922748 Mon Sep 17 00:00:00 2001 From: Andrei Otcheretianski Date: Wed, 30 Jul 2014 
14:36:18 +0300 Subject: ieee80211: Support parsing TPC report element in action frames TPC report element is contained in spectrum management's tpc report action frames and in radio measurement's link measurement report action frames. Add a function which checks whether an action frame contains this element. This may be needed by the drivers in order to set the correct tx power value in these frames. Signed-off-by: Andrei Otcheretianski Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 65 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 63ab3873c5ed..8018c915ee63 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -838,6 +838,16 @@ enum ieee80211_vht_opmode_bits { #define WLAN_SA_QUERY_TR_ID_LEN 2 +/** + * struct ieee80211_tpc_report_ie + * + * This structure refers to "TPC Report element" + */ +struct ieee80211_tpc_report_ie { + u8 tx_power; + u8 link_margin; +} __packed; + struct ieee80211_mgmt { __le16 frame_control; __le16 duration; @@ -973,6 +983,13 @@ struct ieee80211_mgmt { u8 action_code; u8 operating_mode; } __packed vht_opmode_notif; + struct { + u8 action_code; + u8 dialog_token; + u8 tpc_elem_id; + u8 tpc_elem_length; + struct ieee80211_tpc_report_ie tpc; + } __packed tpc_report; } u; } __packed action; } u; @@ -1865,6 +1882,7 @@ enum ieee80211_category { WLAN_CATEGORY_DLS = 2, WLAN_CATEGORY_BACK = 3, WLAN_CATEGORY_PUBLIC = 4, + WLAN_CATEGORY_RADIO_MEASUREMENT = 5, WLAN_CATEGORY_HT = 7, WLAN_CATEGORY_SA_QUERY = 8, WLAN_CATEGORY_PROTECTED_DUAL_OF_ACTION = 9, @@ -2378,4 +2396,51 @@ static inline bool ieee80211_check_tim(const struct ieee80211_tim_ie *tim, #define TU_TO_JIFFIES(x) (usecs_to_jiffies((x) * 1024)) #define TU_TO_EXP_TIME(x) (jiffies + TU_TO_JIFFIES(x)) +/** + * ieee80211_action_contains_tpc - checks if the frame contains TPC element + * @skb: the skb containing the frame, length will be checked + * + * This function checks if it's either TPC report action frame or Link + * Measurement report action frame as defined in IEEE Std. 802.11-2012 8.5.2.5 + * and 8.5.7.5 accordingly. + */ +static inline bool ieee80211_action_contains_tpc(struct sk_buff *skb) +{ + struct ieee80211_mgmt *mgmt = (void *)skb->data; + + if (!ieee80211_is_action(mgmt->frame_control)) + return false; + + if (skb->len < IEEE80211_MIN_ACTION_SIZE + + sizeof(mgmt->u.action.u.tpc_report)) + return false; + + /* + * TPC report - check that: + * category = 0 (Spectrum Management) or 5 (Radio Measurement) + * spectrum management action = 3 (TPC/Link Measurement report) + * TPC report EID = 35 + * TPC report element length = 2 + * + * The spectrum management's tpc_report struct is used here both for + * parsing tpc_report and radio measurement's link measurement report + * frame, since the relevant part is identical in both frames. 
+ */ + if (mgmt->u.action.category != WLAN_CATEGORY_SPECTRUM_MGMT && + mgmt->u.action.category != WLAN_CATEGORY_RADIO_MEASUREMENT) + return false; + + /* both spectrum mgmt and link measurement have same action code */ + if (mgmt->u.action.u.tpc_report.action_code != + WLAN_ACTION_SPCT_TPC_RPRT) + return false; + + if (mgmt->u.action.u.tpc_report.tpc_elem_id != WLAN_EID_TPC_REPORT || + mgmt->u.action.u.tpc_report.tpc_elem_length != + sizeof(struct ieee80211_tpc_report_ie)) + return false; + + return true; +} + #endif /* LINUX_IEEE80211_H */ -- cgit v1.2.3-59-g8ed1b From 5c21403d74af2c9cd635a34c2f9199681a5b813e Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 27 Aug 2014 14:39:04 -0700 Subject: net: Update sk_buff flag bit availability comment. We lost one when xmit_more was added. Signed-off-by: David S. Miller --- include/linux/skbuff.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 9b3802a197a8..b69b7b512c06 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -580,7 +580,7 @@ struct sk_buff { __u8 encap_hdr_csum:1; __u8 csum_valid:1; __u8 csum_complete_sw:1; - /* 2/4 bit hole (depending on ndisc_nodetype presence) */ + /* 1/3 bit hole (depending on ndisc_nodetype presence) */ kmemcheck_bitfield_end(flags2); #if defined CONFIG_NET_DMA || defined CONFIG_NET_RX_BUSY_POLL -- cgit v1.2.3-59-g8ed1b From 3e8a72d1dae374cf6fc1dba97cec663585845ff9 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 27 Aug 2014 17:04:46 -0700 Subject: net: dsa: reduce number of protocol hooks DSA is currently registering one packet_type function per EtherType it needs to intercept in the receive path of a DSA-enabled Ethernet device. Right now we have three of them: trailer, DSA and eDSA, and there might be more in the future; this will not scale to the addition of new protocols. This patch proceeds with adding a new layer of abstraction and two new functions: dsa_switch_rcv(), which will dispatch into the tag-protocol-specific receive function implemented by net/dsa/tag_*.c, and dsa_slave_xmit(), which will dispatch into the tag-protocol-specific transmit function implemented by net/dsa/tag_*.c. When we do create the per-port slave network devices, we iterate over the switch protocol to assign the DSA-specific receive and transmit operations. A new fake ethertype value is used: ETH_P_XDSA, to illustrate the fact that this is no longer going to look like ETH_P_DSA or ETH_P_TRAILER like it used to. This allows us to greatly simplify the check in eth_type_trans() and always override skb->protocol with ETH_P_XDSA for Ethernet switches' tagged protocols, while also reducing the number of repetitive slave netdevice_ops assignments. Signed-off-by: Florian Fainelli Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 28 ++++++++++++--------------- include/net/dsa.h | 20 +++---------------- include/uapi/linux/if_ether.h | 1 + net/dsa/dsa.c | 39 ++++++++++++++++++++----------------- net/dsa/dsa_priv.h | 9 +++------ net/dsa/slave.c | 45 ++++++++++++++----------------------------- net/dsa/tag_dsa.c | 8 ++++---- net/dsa/tag_edsa.c | 8 ++++---- net/dsa/tag_trailer.c | 8 ++++---- net/ethernet/eth.c | 7 ++----- 10 files changed, 68 insertions(+), 105 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 039b23786c22..1875dc71422a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1781,24 +1781,13 @@ void dev_net_set(struct net_device *dev, struct net *net) #endif } -static inline bool netdev_uses_dsa_tags(struct net_device *dev) +static inline bool netdev_uses_dsa(struct net_device *dev) { -#ifdef CONFIG_NET_DSA_TAG_DSA - if (dev->dsa_ptr != NULL) - return dsa_uses_dsa_tags(dev->dsa_ptr); -#endif - - return 0; -} - -static inline bool netdev_uses_trailer_tags(struct net_device *dev) -{ -#ifdef CONFIG_NET_DSA_TAG_TRAILER - if (dev->dsa_ptr != NULL) - return dsa_uses_trailer_tags(dev->dsa_ptr); +#ifdef CONFIG_NET_DSA + return dev->dsa_ptr != NULL; +#else + return false; #endif - - return 0; } /** @@ -1933,6 +1922,13 @@ struct udp_offload { struct offload_callbacks callbacks; }; +struct dsa_device_ops { + netdev_tx_t (*xmit)(struct sk_buff *skb, struct net_device *dev); + int (*rcv)(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt, struct net_device *orig_dev); +}; + + /* often modified stats are per cpu, other are shared (netdev->stats) */ struct pcpu_sw_netstats { u64 rx_packets; diff --git a/include/net/dsa.h b/include/net/dsa.h index 6efce384451e..6e26f1e4d8ce 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -59,6 +59,8 @@ struct dsa_platform_data { struct dsa_chip_data *chip; }; +struct dsa_device_ops; + struct dsa_switch_tree { /* * Configuration data for the platform device that owns @@ -71,6 +73,7 @@ struct dsa_switch_tree { * protocol to use. */ struct net_device *master_netdev; + const struct dsa_device_ops *ops; __be16 tag_protocol; /* @@ -186,21 +189,4 @@ static inline void *ds_to_priv(struct dsa_switch *ds) return (void *)(ds + 1); } -/* - * The original DSA tag format and some other tag formats have no - * ethertype, which means that we need to add a little hack to the - * networking receive path to make sure that received frames get - * the right ->protocol assigned to them when one of those tag - * formats is in use. - */ -static inline bool dsa_uses_dsa_tags(struct dsa_switch_tree *dst) -{ - return !!(dst->tag_protocol == htons(ETH_P_DSA)); -} - -static inline bool dsa_uses_trailer_tags(struct dsa_switch_tree *dst) -{ - return !!(dst->tag_protocol == htons(ETH_P_TRAILER)); -} - #endif diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h index 0f8210b8e0bc..aa63ed023c2b 100644 --- a/include/uapi/linux/if_ether.h +++ b/include/uapi/linux/if_ether.h @@ -128,6 +128,7 @@ #define ETH_P_PHONET 0x00F5 /* Nokia Phonet frames */ #define ETH_P_IEEE802154 0x00F6 /* IEEE802.15.4 frame */ #define ETH_P_CAIF 0x00F7 /* ST-Ericsson CAIF protocol */ +#define ETH_P_XDSA 0x00F8 /* Multiplexed DSA protocol */ /* * This is an Ethernet frame header. 
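[Editor's note: the dsa_device_ops structure added above is the crux of this change; a tagging protocol now only exports an xmit/rcv pair instead of its own packet_type and net_device_ops. A rough sketch of what a new tagger would look like under this scheme (the example_* names are invented for illustration; only the dsa_device_ops layout comes from the patch):

static netdev_tx_t example_xmit(struct sk_buff *skb, struct net_device *dev)
{
	/* Prepend or append the switch tag here, then queue the skb
	 * to the master network device.
	 */
	return NETDEV_TX_OK;
}

static int example_rcv(struct sk_buff *skb, struct net_device *dev,
		       struct packet_type *pt, struct net_device *orig_dev)
{
	/* Strip the switch tag and demux to the right slave netdev. */
	return 0;
}

const struct dsa_device_ops example_netdev_ops = {
	.xmit	= example_xmit,
	.rcv	= example_rcv,
};

The single dsa_switch_rcv() packet_type and dsa_slave_xmit() shown below then dispatch through dst->ops, so the core never has to know which tag format is in use.]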
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 0a49632fac47..92e71d2a2ccd 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -608,6 +608,24 @@ static void dsa_shutdown(struct platform_device *pdev) { } +static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt, struct net_device *orig_dev) +{ + struct dsa_switch_tree *dst = dev->dsa_ptr; + + if (unlikely(dst == NULL)) { + kfree_skb(skb); + return 0; + } + + return dst->ops->rcv(skb, dev, pt, orig_dev); +} + +struct packet_type dsa_pack_type __read_mostly = { + .type = cpu_to_be16(ETH_P_XDSA), + .func = dsa_switch_rcv, +}; + static const struct of_device_id dsa_of_match_table[] = { { .compatible = "marvell,dsa", }, {} @@ -633,30 +651,15 @@ static int __init dsa_init_module(void) if (rc) return rc; -#ifdef CONFIG_NET_DSA_TAG_DSA - dev_add_pack(&dsa_packet_type); -#endif -#ifdef CONFIG_NET_DSA_TAG_EDSA - dev_add_pack(&edsa_packet_type); -#endif -#ifdef CONFIG_NET_DSA_TAG_TRAILER - dev_add_pack(&trailer_packet_type); -#endif + dev_add_pack(&dsa_pack_type); + return 0; } module_init(dsa_init_module); static void __exit dsa_cleanup_module(void) { -#ifdef CONFIG_NET_DSA_TAG_TRAILER - dev_remove_pack(&trailer_packet_type); -#endif -#ifdef CONFIG_NET_DSA_TAG_EDSA - dev_remove_pack(&edsa_packet_type); -#endif -#ifdef CONFIG_NET_DSA_TAG_DSA - dev_remove_pack(&dsa_packet_type); -#endif + dev_remove_pack(&dsa_pack_type); platform_driver_unregister(&dsa_driver); } module_exit(dsa_cleanup_module); diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index d4cf5cc747e3..218d75d16f6f 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -45,16 +45,13 @@ struct net_device *dsa_slave_create(struct dsa_switch *ds, int port, char *name); /* tag_dsa.c */ -netdev_tx_t dsa_xmit(struct sk_buff *skb, struct net_device *dev); -extern struct packet_type dsa_packet_type; +extern const struct dsa_device_ops dsa_netdev_ops; /* tag_edsa.c */ -netdev_tx_t edsa_xmit(struct sk_buff *skb, struct net_device *dev); -extern struct packet_type edsa_packet_type; +extern const struct dsa_device_ops edsa_netdev_ops; /* tag_trailer.c */ -netdev_tx_t trailer_xmit(struct sk_buff *skb, struct net_device *dev); -extern struct packet_type trailer_packet_type; +extern const struct dsa_device_ops trailer_netdev_ops; #endif diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 45a1e34c89e0..ad1a913533aa 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -171,6 +171,14 @@ static int dsa_slave_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) return -EOPNOTSUPP; } +static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_switch_tree *dst = p->parent->dst; + + return dst->ops->xmit(skb, dev); +} + /* ethtool operations *******************************************************/ static int @@ -293,42 +301,16 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = { .get_sset_count = dsa_slave_get_sset_count, }; -#ifdef CONFIG_NET_DSA_TAG_DSA -static const struct net_device_ops dsa_netdev_ops = { +static const struct net_device_ops dsa_slave_netdev_ops = { .ndo_init = dsa_slave_init, .ndo_open = dsa_slave_open, .ndo_stop = dsa_slave_close, - .ndo_start_xmit = dsa_xmit, + .ndo_start_xmit = dsa_slave_xmit, .ndo_change_rx_flags = dsa_slave_change_rx_flags, .ndo_set_rx_mode = dsa_slave_set_rx_mode, .ndo_set_mac_address = dsa_slave_set_mac_address, .ndo_do_ioctl = dsa_slave_ioctl, }; -#endif -#ifdef CONFIG_NET_DSA_TAG_EDSA -static const struct net_device_ops 
edsa_netdev_ops = { - .ndo_init = dsa_slave_init, - .ndo_open = dsa_slave_open, - .ndo_stop = dsa_slave_close, - .ndo_start_xmit = edsa_xmit, - .ndo_change_rx_flags = dsa_slave_change_rx_flags, - .ndo_set_rx_mode = dsa_slave_set_rx_mode, - .ndo_set_mac_address = dsa_slave_set_mac_address, - .ndo_do_ioctl = dsa_slave_ioctl, -}; -#endif -#ifdef CONFIG_NET_DSA_TAG_TRAILER -static const struct net_device_ops trailer_netdev_ops = { - .ndo_init = dsa_slave_init, - .ndo_open = dsa_slave_open, - .ndo_stop = dsa_slave_close, - .ndo_start_xmit = trailer_xmit, - .ndo_change_rx_flags = dsa_slave_change_rx_flags, - .ndo_set_rx_mode = dsa_slave_set_rx_mode, - .ndo_set_mac_address = dsa_slave_set_mac_address, - .ndo_do_ioctl = dsa_slave_ioctl, -}; -#endif /* slave device setup *******************************************************/ struct net_device * @@ -349,21 +331,22 @@ dsa_slave_create(struct dsa_switch *ds, struct device *parent, slave_dev->ethtool_ops = &dsa_slave_ethtool_ops; eth_hw_addr_inherit(slave_dev, master); slave_dev->tx_queue_len = 0; + slave_dev->netdev_ops = &dsa_slave_netdev_ops; switch (ds->dst->tag_protocol) { #ifdef CONFIG_NET_DSA_TAG_DSA case htons(ETH_P_DSA): - slave_dev->netdev_ops = &dsa_netdev_ops; + ds->dst->ops = &dsa_netdev_ops; break; #endif #ifdef CONFIG_NET_DSA_TAG_EDSA case htons(ETH_P_EDSA): - slave_dev->netdev_ops = &edsa_netdev_ops; + ds->dst->ops = &edsa_netdev_ops; break; #endif #ifdef CONFIG_NET_DSA_TAG_TRAILER case htons(ETH_P_TRAILER): - slave_dev->netdev_ops = &trailer_netdev_ops; + ds->dst->ops = &trailer_netdev_ops; break; #endif default: diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c index cacce1e22f9c..d7dbc5bda5c0 100644 --- a/net/dsa/tag_dsa.c +++ b/net/dsa/tag_dsa.c @@ -16,7 +16,7 @@ #define DSA_HLEN 4 -netdev_tx_t dsa_xmit(struct sk_buff *skb, struct net_device *dev) +static netdev_tx_t dsa_xmit(struct sk_buff *skb, struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); u8 *dsa_header; @@ -186,7 +186,7 @@ out: return 0; } -struct packet_type dsa_packet_type __read_mostly = { - .type = cpu_to_be16(ETH_P_DSA), - .func = dsa_rcv, +const struct dsa_device_ops dsa_netdev_ops = { + .xmit = dsa_xmit, + .rcv = dsa_rcv, }; diff --git a/net/dsa/tag_edsa.c b/net/dsa/tag_edsa.c index e70c43c25e64..6b30abe89183 100644 --- a/net/dsa/tag_edsa.c +++ b/net/dsa/tag_edsa.c @@ -17,7 +17,7 @@ #define DSA_HLEN 4 #define EDSA_HLEN 8 -netdev_tx_t edsa_xmit(struct sk_buff *skb, struct net_device *dev) +static netdev_tx_t edsa_xmit(struct sk_buff *skb, struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); u8 *edsa_header; @@ -205,7 +205,7 @@ out: return 0; } -struct packet_type edsa_packet_type __read_mostly = { - .type = cpu_to_be16(ETH_P_EDSA), - .func = edsa_rcv, +const struct dsa_device_ops edsa_netdev_ops = { + .xmit = edsa_xmit, + .rcv = edsa_rcv, }; diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c index 94bc260d015d..5fe9444842c5 100644 --- a/net/dsa/tag_trailer.c +++ b/net/dsa/tag_trailer.c @@ -14,7 +14,7 @@ #include #include "dsa_priv.h" -netdev_tx_t trailer_xmit(struct sk_buff *skb, struct net_device *dev) +static netdev_tx_t trailer_xmit(struct sk_buff *skb, struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); struct sk_buff *nskb; @@ -114,7 +114,7 @@ out: return 0; } -struct packet_type trailer_packet_type __read_mostly = { - .type = cpu_to_be16(ETH_P_TRAILER), - .func = trailer_rcv, +const struct dsa_device_ops trailer_netdev_ops = { + .xmit = trailer_xmit, + .rcv = trailer_rcv, }; diff --git 
a/net/ethernet/eth.c b/net/ethernet/eth.c index f405e0592407..5cebca134585 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -181,11 +181,8 @@ __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev) * variants has been configured on the receiving interface, * and if so, set skb->protocol without looking at the packet. */ - if (unlikely(netdev_uses_dsa_tags(dev))) - return htons(ETH_P_DSA); - - if (unlikely(netdev_uses_trailer_tags(dev))) - return htons(ETH_P_TRAILER); + if (unlikely(netdev_uses_dsa(dev))) + return htons(ETH_P_XDSA); if (likely(ntohs(eth->h_proto) >= ETH_P_802_3_MIN)) return eth->h_proto; -- cgit v1.2.3-59-g8ed1b From 464c3668f065baeacfffa9d421959d21069389fe Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 27 Aug 2014 17:04:48 -0700 Subject: net: phy: provide stub for fixed_phy_set_link_update In preparation for updating the DSA code and to avoid using ifdefs there, provide an empty stub for fixed_phy_set_link_update when CONFIG_FIXED_PHY is not selected. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/phy_fixed.h | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/phy_fixed.h b/include/linux/phy_fixed.h index ae612acebb53..941138664c1d 100644 --- a/include/linux/phy_fixed.h +++ b/include/linux/phy_fixed.h @@ -18,6 +18,9 @@ extern int fixed_phy_register(unsigned int irq, struct fixed_phy_status *status, struct device_node *np); extern void fixed_phy_del(int phy_addr); +extern int fixed_phy_set_link_update(struct phy_device *phydev, + int (*link_update)(struct net_device *, + struct fixed_phy_status *)); #else static inline int fixed_phy_add(unsigned int irq, int phy_id, struct fixed_phy_status *status) @@ -34,14 +37,12 @@ static inline int fixed_phy_del(int phy_addr) { return -ENODEV; } -#endif /* CONFIG_FIXED_PHY */ - -/* - * This function issued only by fixed_phy-aware drivers, no need - * protect it with #ifdef - */ -extern int fixed_phy_set_link_update(struct phy_device *phydev, +static inline int fixed_phy_set_link_update(struct phy_device *phydev, int (*link_update)(struct net_device *, - struct fixed_phy_status *)); + struct fixed_phy_status *)) +{ + return -ENODEV; +} +#endif /* CONFIG_FIXED_PHY */ #endif /* __PHY_FIXED_H */ -- cgit v1.2.3-59-g8ed1b From 5aed85cec29882d1c4b4b2a01cb75a99efdbe4ed Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 27 Aug 2014 17:04:52 -0700 Subject: net: dsa: allow switches to work without tagging In case switch port tagging is disabled (voluntarily, or the switch just does not support it), allow us to continue using the defined set of dsa_device_ops in net/dsa/slave.c. We introduce dsa_uses_tagged_protocol() to check whether we need to override skb->protocol and go through the DSA-specific packet_type function, or if we just go on and receive the SKB through the normal path. Signed-off-by: Florian Fainelli Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 6 +++--- include/net/dsa.h | 5 +++++ net/dsa/slave.c | 19 ++++++++++++++++++- 3 files changed, 26 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1875dc71422a..429801370d0c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1784,10 +1784,10 @@ void dev_net_set(struct net_device *dev, struct net *net) static inline bool netdev_uses_dsa(struct net_device *dev) { #ifdef CONFIG_NET_DSA - return dev->dsa_ptr != NULL; -#else - return false; + if (dev->dsa_ptr != NULL) + return dsa_uses_tagged_protocol(dev->dsa_ptr); #endif + return false; } /** diff --git a/include/net/dsa.h b/include/net/dsa.h index dc357454ae3b..1035f6452d79 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -198,4 +198,9 @@ static inline void *ds_to_priv(struct dsa_switch *ds) return (void *)(ds + 1); } +static inline bool dsa_uses_tagged_protocol(struct dsa_switch_tree *dst) +{ + return dst->tag_protocol != 0; +} + #endif diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 03d2894a0f8a..241c2a1684cb 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -181,6 +181,17 @@ static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev) return dst->ops->xmit(skb, dev); } +static netdev_tx_t dsa_slave_notag_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + + skb->dev = p->parent->dst->master_netdev; + dev_queue_xmit(skb); + + return NETDEV_TX_OK; +} + /* ethtool operations *******************************************************/ static int @@ -314,6 +325,11 @@ static const struct net_device_ops dsa_slave_netdev_ops = { .ndo_do_ioctl = dsa_slave_ioctl, }; +static const struct dsa_device_ops notag_netdev_ops = { + .xmit = dsa_slave_notag_xmit, + .rcv = NULL, +}; + static void dsa_slave_adjust_link(struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); @@ -415,7 +431,8 @@ dsa_slave_create(struct dsa_switch *ds, struct device *parent, break; #endif default: - BUG(); + ds->dst->ops = &notag_netdev_ops; + break; } SET_NETDEV_DEV(slave_dev, parent); -- cgit v1.2.3-59-g8ed1b From 97fdaab4699de3a2a91001efef60bb0622de1c53 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 26 Aug 2014 13:15:25 -0700 Subject: net: phy: broadcom: fix PHY_BCM_OUI_4 PHY_BCM_OUI_4 is missing two significant digits that actually make it an OUI; add those missing bits so it becomes usable again for matching. Fixes: b560a58c45c6 ("net: phy: add Broadcom BCM7xxx internal PHY driver") Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/brcmphy.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h index ee1431d976fa..921f17ca4c26 100644 --- a/include/linux/brcmphy.h +++ b/include/linux/brcmphy.h @@ -21,7 +21,7 @@ #define PHY_BCM_OUI_1 0x00206000 #define PHY_BCM_OUI_2 0x0143bc00 #define PHY_BCM_OUI_3 0x03625c00 -#define PHY_BCM_OUI_4 0x600d0000 +#define PHY_BCM_OUI_4 0x600d8400 #define PHY_BCM_OUI_5 0x03625e00 -- cgit v1.2.3-59-g8ed1b From 11bf2bbd596add62a86a74fc7aedc0b86c6ec154 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 26 Aug 2014 13:15:26 -0700 Subject: net: phy: broadcom: add new Broadcom OUI Broadcom started to use a new OUI for its 2013 and newer products: D4-01-29, which translates into 0xae025000 as a 32-bit OUI; add its definition.
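[Editor's note: the 0xae025000 value can be checked by hand. IEEE 802.3 clause 22.2.4.3.1 maps OUI bits 3 through 24, taken in Ethernet transmission order (least significant bit of each octet first), into bits 31..10 of the combined 32-bit PHY identifier. A small user-space sketch of that packing (editor's code, not part of the patch):

#include <stdio.h>

/* Pack OUI bits 3..24 (transmission order, LSB-first per octet)
 * into bits 31..10 of a clause 22 style 32-bit PHY identifier.
 */
static unsigned int oui_to_phy_id(const unsigned char oui[3])
{
	unsigned int id = 0;
	int bit;

	for (bit = 3; bit <= 24; bit++) {
		int byte = (bit - 1) / 8;
		int pos = (bit - 1) % 8;	/* LSB-first within an octet */

		if (oui[byte] & (1u << pos))
			id |= 1u << (31 - (bit - 3));
	}
	return id;
}

int main(void)
{
	const unsigned char bcm[3] = { 0xd4, 0x01, 0x29 };

	printf("0x%08x\n", oui_to_phy_id(bcm));	/* prints 0xae025000 */
	return 0;
}

The low ten bits stay free for the model and revision fields, which is why the BCM7250/BCM7364 IDs added in the following commit (0xae025280 and 0xae025260) share the 0xae025 prefix.]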
Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/brcmphy.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h index 921f17ca4c26..cbcfad36d4c0 100644 --- a/include/linux/brcmphy.h +++ b/include/linux/brcmphy.h @@ -23,7 +23,7 @@ #define PHY_BCM_OUI_3 0x03625c00 #define PHY_BCM_OUI_4 0x600d8400 #define PHY_BCM_OUI_5 0x03625e00 - +#define PHY_BCM_OUI_6 0xae025000 #define PHY_BCM_FLAGS_MODE_COPPER 0x00000001 #define PHY_BCM_FLAGS_MODE_1000BX 0x00000002 -- cgit v1.2.3-59-g8ed1b From 430ad68ffb5fa632a277162e5995cd6f7a39fb78 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 26 Aug 2014 13:15:27 -0700 Subject: net: phy: bcm7xxx: add BCM7250 and BCM7364 PHY entries Add two new entries to the Broadcom BCM7xxx internal PHY driver for BCM7250 and BCM7364 chips. Those chips share the usual 28nm process Gigabit PHY sequence and require the same workarounds so far. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/bcm7xxx.c | 4 ++++ include/linux/brcmphy.h | 2 ++ 2 files changed, 6 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/phy/bcm7xxx.c b/drivers/net/phy/bcm7xxx.c index 948c7086679a..09dd6e1dc6e1 100644 --- a/drivers/net/phy/bcm7xxx.c +++ b/drivers/net/phy/bcm7xxx.c @@ -335,6 +335,8 @@ static int bcm7xxx_dummy_config_init(struct phy_device *phydev) } static struct phy_driver bcm7xxx_driver[] = { + BCM7XXX_28NM_GPHY(PHY_ID_BCM7250, "Broadcom BCM7250"), + BCM7XXX_28NM_GPHY(PHY_ID_BCM7364, "Broadcom BCM7364"), BCM7XXX_28NM_GPHY(PHY_ID_BCM7366, "Broadcom BCM7366"), BCM7XXX_28NM_GPHY(PHY_ID_BCM7439, "Broadcom BCM7439"), BCM7XXX_28NM_GPHY(PHY_ID_BCM7445, "Broadcom BCM7445"), @@ -367,6 +369,8 @@ static struct phy_driver bcm7xxx_driver[] = { } }; static struct mdio_device_id __maybe_unused bcm7xxx_tbl[] = { + { PHY_ID_BCM7250, 0xfffffff0, }, + { PHY_ID_BCM7364, 0xfffffff0, }, { PHY_ID_BCM7366, 0xfffffff0, }, { PHY_ID_BCM7439, 0xfffffff0, }, { PHY_ID_BCM7445, 0xfffffff0, }, diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h index cbcfad36d4c0..5bd35cc0d471 100644 --- a/include/linux/brcmphy.h +++ b/include/linux/brcmphy.h @@ -13,6 +13,8 @@ #define PHY_ID_BCM5461 0x002060c0 #define PHY_ID_BCM57780 0x03625d90 +#define PHY_ID_BCM7250 0xae025280 +#define PHY_ID_BCM7364 0xae025260 #define PHY_ID_BCM7366 0x600d8490 #define PHY_ID_BCM7439 0x600d8480 #define PHY_ID_BCM7445 0x600d8510 -- cgit v1.2.3-59-g8ed1b From 10c51b56232d24f150e39884a9e749fd99cbc60c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 27 Aug 2014 11:11:27 +0200 Subject: net: add skb_get_tx_queue() helper Replace occurrences of skb_get_queue_mapping() and follow-up netdev_get_tx_queue() with an actual helper function. Signed-off-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 6 ++++++ net/core/netpoll.c | 2 +- net/core/pktgen.c | 4 +--- net/packet/af_packet.c | 4 +--- net/sched/sch_generic.c | 6 ++++-- 5 files changed, 13 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 429801370d0c..dfc1d8b8bd0f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1747,6 +1747,12 @@ struct netdev_queue *netdev_get_tx_queue(const struct net_device *dev, return &dev->_tx[index]; } +static inline struct netdev_queue *skb_get_tx_queue(const struct net_device *dev, + const struct sk_buff *skb) +{ + return netdev_get_tx_queue(dev, skb_get_queue_mapping(skb)); +} + static inline void netdev_for_each_tx_queue(struct net_device *dev, void (*f)(struct net_device *, struct netdev_queue *, diff --git a/net/core/netpoll.c b/net/core/netpoll.c index a5ad06828d67..12b1df976562 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -115,7 +115,7 @@ static void queue_process(struct work_struct *work) continue; } - txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb)); + txq = skb_get_tx_queue(dev, skb); local_irq_save(flags); HARD_TX_LOCK(dev, txq, smp_processor_id()); diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 83e2b4b19eb7..d81b540096c3 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3286,7 +3286,6 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) { struct net_device *odev = pkt_dev->odev; struct netdev_queue *txq; - u16 queue_map; int ret; /* If device is offline, then don't send */ @@ -3324,8 +3323,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) if (pkt_dev->delay && pkt_dev->last_ok) spin(pkt_dev, pkt_dev->next_tx); - queue_map = skb_get_queue_mapping(pkt_dev->skb); - txq = netdev_get_tx_queue(odev, queue_map); + txq = skb_get_tx_queue(odev, pkt_dev->skb); local_bh_disable(); diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 0dfa990d4eaa..b7a7f5a721bd 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -243,7 +243,6 @@ static int packet_direct_xmit(struct sk_buff *skb) netdev_features_t features; struct netdev_queue *txq; int ret = NETDEV_TX_BUSY; - u16 queue_map; if (unlikely(!netif_running(dev) || !netif_carrier_ok(dev))) @@ -254,8 +253,7 @@ static int packet_direct_xmit(struct sk_buff *skb) __skb_linearize(skb)) goto drop; - queue_map = skb_get_queue_mapping(skb); - txq = netdev_get_tx_queue(dev, queue_map); + txq = skb_get_tx_queue(dev, skb); local_bh_disable(); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index fc04fe93c2da..05b3f5d104af 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -63,7 +63,7 @@ static inline struct sk_buff *dequeue_skb(struct Qdisc *q) if (unlikely(skb)) { /* check the reason of requeuing without tx lock first */ - txq = netdev_get_tx_queue(txq->dev, skb_get_queue_mapping(skb)); + txq = skb_get_tx_queue(txq->dev, skb); if (!netif_xmit_frozen_or_stopped(txq)) { q->gso_skb = NULL; q->q.qlen--; @@ -183,10 +183,12 @@ static inline int qdisc_restart(struct Qdisc *q) skb = dequeue_skb(q); if (unlikely(!skb)) return 0; + WARN_ON_ONCE(skb_dst_is_noref(skb)); + root_lock = qdisc_lock(q); dev = qdisc_dev(q); - txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb)); + txq = skb_get_tx_queue(dev, skb); return sch_direct_xmit(skb, q, dev, txq, root_lock); } -- cgit v1.2.3-59-g8ed1b From 18fe8db5f2b53e4ac67b47048f24f50c57a2a759 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Thu, 28 Aug 2014 13:44:31 +0200 
Subject: include/linux/cycx_x25.h: Remove unused header The header file include/linux/cycx_x25.h does not seem to be used anywhere. It was orphaned by 6fcdf4facb "wanrouter: delete now orphaned header content, files/drivers". Remove it. Signed-off-by: Rasmus Villemoes Signed-off-by: David S. Miller --- include/linux/cycx_x25.h | 125 ----------------------------------------------- 1 file changed, 125 deletions(-) delete mode 100644 include/linux/cycx_x25.h (limited to 'include/linux') diff --git a/include/linux/cycx_x25.h b/include/linux/cycx_x25.h deleted file mode 100644 index 362bf19d6cf1..000000000000 --- a/include/linux/cycx_x25.h +++ /dev/null @@ -1,125 +0,0 @@ -#ifndef _CYCX_X25_H -#define _CYCX_X25_H -/* -* cycx_x25.h Cyclom X.25 firmware API definitions. -* -* Author: Arnaldo Carvalho de Melo -* -* Copyright: (c) 1998-2003 Arnaldo Carvalho de Melo -* -* Based on sdla_x25.h by Gene Kozin <74604.152@compuserve.com> -* -* This program is free software; you can redistribute it and/or -* modify it under the terms of the GNU General Public License -* as published by the Free Software Foundation; either version -* 2 of the License, or (at your option) any later version. -* ============================================================================ -* 2000/04/02 acme dprintk and cycx_debug -* 1999/01/03 acme judicious use of data types -* 1999/01/02 acme #define X25_ACK_N3 0x4411 -* 1998/12/28 acme cleanup: lot'o'things removed -* commands listed, -* TX25Cmd & TX25Config structs -* typedef'ed -*/ -#ifndef PACKED -#define PACKED __attribute__((packed)) -#endif - -/* X.25 shared memory layout. */ -#define X25_MBOX_OFFS 0x300 /* general mailbox block */ -#define X25_RXMBOX_OFFS 0x340 /* receive mailbox */ - -/* Debug */ -#define dprintk(level, format, a...) if (cycx_debug >= level) printk(format, ##a) - -extern unsigned int cycx_debug; - -/* Data Structures */ -/* X.25 Command Block. */ -struct cycx_x25_cmd { - u16 command; - u16 link; /* values: 0 or 1 */ - u16 len; /* values: 0 thru 0x205 (517) */ - u32 buf; -} PACKED; - -/* Defines for the 'command' field. */ -#define X25_CONNECT_REQUEST 0x4401 -#define X25_CONNECT_RESPONSE 0x4402 -#define X25_DISCONNECT_REQUEST 0x4403 -#define X25_DISCONNECT_RESPONSE 0x4404 -#define X25_DATA_REQUEST 0x4405 -#define X25_ACK_TO_VC 0x4406 -#define X25_INTERRUPT_RESPONSE 0x4407 -#define X25_CONFIG 0x4408 -#define X25_CONNECT_INDICATION 0x4409 -#define X25_CONNECT_CONFIRM 0x440A -#define X25_DISCONNECT_INDICATION 0x440B -#define X25_DISCONNECT_CONFIRM 0x440C -#define X25_DATA_INDICATION 0x440E -#define X25_INTERRUPT_INDICATION 0x440F -#define X25_ACK_FROM_VC 0x4410 -#define X25_ACK_N3 0x4411 -#define X25_CONNECT_COLLISION 0x4413 -#define X25_N3WIN 0x4414 -#define X25_LINE_ON 0x4415 -#define X25_LINE_OFF 0x4416 -#define X25_RESET_REQUEST 0x4417 -#define X25_LOG 0x4500 -#define X25_STATISTIC 0x4600 -#define X25_TRACE 0x4700 -#define X25_N2TRACEXC 0x4702 -#define X25_N3TRACEXC 0x4703 - -/** - * struct cycx_x25_config - cyclom2x x25 firmware configuration - * @link - link number - * @speed - line speed - * @clock - internal/external - * @n2 - # of level 2 retransm.(values: 1 thru FF) - * @n2win - level 2 window (values: 1 thru 7) - * @n3win - level 3 window (values: 1 thru 7) - * @nvc - # of logical channels (values: 1 thru 64) - * @pktlen - level 3 packet length - log base 2 of size - * @locaddr - my address - * @remaddr - remote address - * @t1 - time, in seconds - * @t2 - time, in seconds - * @t21 - time, in seconds - * @npvc - # of permanent virt. 
circuits (1 thru nvc) - * @t23 - time, in seconds - * @flags - see dosx25.doc, in portuguese, for details - */ -struct cycx_x25_config { - u8 link; - u8 speed; - u8 clock; - u8 n2; - u8 n2win; - u8 n3win; - u8 nvc; - u8 pktlen; - u8 locaddr; - u8 remaddr; - u16 t1; - u16 t2; - u8 t21; - u8 npvc; - u8 t23; - u8 flags; -} PACKED; - -struct cycx_x25_stats { - u16 rx_crc_errors; - u16 rx_over_errors; - u16 n2_tx_frames; - u16 n2_rx_frames; - u16 tx_timeouts; - u16 rx_timeouts; - u16 n3_tx_packets; - u16 n3_rx_packets; - u16 tx_aborts; - u16 rx_aborts; -} PACKED; -#endif /* _CYCX_X25_H */ -- cgit v1.2.3-59-g8ed1b From fbd74659d4513816a6249b0db491e8d831803520 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Thu, 28 Aug 2014 13:44:32 +0200 Subject: include/linux/i82593.h: Remove unused header The header file include/linux/i82593.h does not seem to be used anywhere. It was orphaned by 8a594170 "drivers/net: delete intel i825xx based znet notebook driver". Remove it. Signed-off-by: Rasmus Villemoes Signed-off-by: David S. Miller --- include/linux/i82593.h | 229 ------------------------------------------------- 1 file changed, 229 deletions(-) delete mode 100644 include/linux/i82593.h (limited to 'include/linux') diff --git a/include/linux/i82593.h b/include/linux/i82593.h deleted file mode 100644 index afac5c7a323d..000000000000 --- a/include/linux/i82593.h +++ /dev/null @@ -1,229 +0,0 @@ -/* - * Definitions for Intel 82593 CSMA/CD Core LAN Controller - * The definitions are taken from the 1992 users manual with Intel - * order number 297125-001. - * - * /usr/src/pc/RCS/i82593.h,v 1.1 1996/07/17 15:23:12 root Exp - * - * Copyright 1994, Anders Klemets - * - * HISTORY - * i82593.h,v - * Revision 1.4 2005/11/4 09:15:00 baroniunas - * Modified copyright with permission of author as follows: - * - * "If I82539.H is the only file with my copyright statement - * that is included in the Source Forge project, then you have - * my approval to change the copyright statement to be a GPL - * license, in the way you proposed on October 10." - * - * Revision 1.1 1996/07/17 15:23:12 root - * Initial revision - * - * Revision 1.3 1995/04/05 15:13:58 adj - * Initial alpha release - * - * Revision 1.2 1994/06/16 23:57:31 klemets - * Mirrored all the fields in the configuration block. 
- * - * Revision 1.1 1994/06/02 20:25:34 klemets - * Initial revision - * - * - */ -#ifndef _I82593_H -#define _I82593_H - -/* Intel 82593 CSMA/CD Core LAN Controller */ - -/* Port 0 Command Register definitions */ - -/* Execution operations */ -#define OP0_NOP 0 /* CHNL = 0 */ -#define OP0_SWIT_TO_PORT_1 0 /* CHNL = 1 */ -#define OP0_IA_SETUP 1 -#define OP0_CONFIGURE 2 -#define OP0_MC_SETUP 3 -#define OP0_TRANSMIT 4 -#define OP0_TDR 5 -#define OP0_DUMP 6 -#define OP0_DIAGNOSE 7 -#define OP0_TRANSMIT_NO_CRC 9 -#define OP0_RETRANSMIT 12 -#define OP0_ABORT 13 -/* Reception operations */ -#define OP0_RCV_ENABLE 8 -#define OP0_RCV_DISABLE 10 -#define OP0_STOP_RCV 11 -/* Status pointer control operations */ -#define OP0_FIX_PTR 15 /* CHNL = 1 */ -#define OP0_RLS_PTR 15 /* CHNL = 0 */ -#define OP0_RESET 14 - -#define CR0_CHNL (1 << 4) /* 0=Channel 0, 1=Channel 1 */ -#define CR0_STATUS_0 0x00 -#define CR0_STATUS_1 0x20 -#define CR0_STATUS_2 0x40 -#define CR0_STATUS_3 0x60 -#define CR0_INT_ACK (1 << 7) /* 0=No ack, 1=acknowledge */ - -/* Port 0 Status Register definitions */ - -#define SR0_NO_RESULT 0 /* dummy */ -#define SR0_EVENT_MASK 0x0f -#define SR0_IA_SETUP_DONE 1 -#define SR0_CONFIGURE_DONE 2 -#define SR0_MC_SETUP_DONE 3 -#define SR0_TRANSMIT_DONE 4 -#define SR0_TDR_DONE 5 -#define SR0_DUMP_DONE 6 -#define SR0_DIAGNOSE_PASSED 7 -#define SR0_TRANSMIT_NO_CRC_DONE 9 -#define SR0_RETRANSMIT_DONE 12 -#define SR0_EXECUTION_ABORTED 13 -#define SR0_END_OF_FRAME 8 -#define SR0_RECEPTION_ABORTED 10 -#define SR0_DIAGNOSE_FAILED 15 -#define SR0_STOP_REG_HIT 11 - -#define SR0_CHNL (1 << 4) -#define SR0_EXECUTION (1 << 5) -#define SR0_RECEPTION (1 << 6) -#define SR0_INTERRUPT (1 << 7) -#define SR0_BOTH_RX_TX (SR0_EXECUTION | SR0_RECEPTION) - -#define SR3_EXEC_STATE_MASK 0x03 -#define SR3_EXEC_IDLE 0 -#define SR3_TX_ABORT_IN_PROGRESS 1 -#define SR3_EXEC_ACTIVE 2 -#define SR3_ABORT_IN_PROGRESS 3 -#define SR3_EXEC_CHNL (1 << 2) -#define SR3_STP_ON_NO_RSRC (1 << 3) -#define SR3_RCVING_NO_RSRC (1 << 4) -#define SR3_RCV_STATE_MASK 0x60 -#define SR3_RCV_IDLE 0x00 -#define SR3_RCV_READY 0x20 -#define SR3_RCV_ACTIVE 0x40 -#define SR3_RCV_STOP_IN_PROG 0x60 -#define SR3_RCV_CHNL (1 << 7) - -/* Port 1 Command Register definitions */ - -#define OP1_NOP 0 -#define OP1_SWIT_TO_PORT_0 1 -#define OP1_INT_DISABLE 2 -#define OP1_INT_ENABLE 3 -#define OP1_SET_TS 5 -#define OP1_RST_TS 7 -#define OP1_POWER_DOWN 8 -#define OP1_RESET_RING_MNGMT 11 -#define OP1_RESET 14 -#define OP1_SEL_RST 15 - -#define CR1_STATUS_4 0x00 -#define CR1_STATUS_5 0x20 -#define CR1_STATUS_6 0x40 -#define CR1_STOP_REG_UPDATE (1 << 7) - -/* Receive frame status bits */ - -#define RX_RCLD (1 << 0) -#define RX_IA_MATCH (1 << 1) -#define RX_NO_AD_MATCH (1 << 2) -#define RX_NO_SFD (1 << 3) -#define RX_SRT_FRM (1 << 7) -#define RX_OVRRUN (1 << 8) -#define RX_ALG_ERR (1 << 10) -#define RX_CRC_ERR (1 << 11) -#define RX_LEN_ERR (1 << 12) -#define RX_RCV_OK (1 << 13) -#define RX_TYP_LEN (1 << 15) - -/* Transmit status bits */ - -#define TX_NCOL_MASK 0x0f -#define TX_FRTL (1 << 4) -#define TX_MAX_COL (1 << 5) -#define TX_HRT_BEAT (1 << 6) -#define TX_DEFER (1 << 7) -#define TX_UND_RUN (1 << 8) -#define TX_LOST_CTS (1 << 9) -#define TX_LOST_CRS (1 << 10) -#define TX_LTCOL (1 << 11) -#define TX_OK (1 << 13) -#define TX_COLL (1 << 15) - -struct i82593_conf_block { - u_char fifo_limit : 4, - forgnesi : 1, - fifo_32 : 1, - d6mod : 1, - throttle_enb : 1; - u_char throttle : 6, - cntrxint : 1, - contin : 1; - u_char addr_len : 3, - acloc : 1, - preamb_len : 2, - 
loopback : 2; - u_char lin_prio : 3, - tbofstop : 1, - exp_prio : 3, - bof_met : 1; - u_char : 4, - ifrm_spc : 4; - u_char : 5, - slottim_low : 3; - u_char slottim_hi : 3, - : 1, - max_retr : 4; - u_char prmisc : 1, - bc_dis : 1, - : 1, - crs_1 : 1, - nocrc_ins : 1, - crc_1632 : 1, - : 1, - crs_cdt : 1; - u_char cs_filter : 3, - crs_src : 1, - cd_filter : 3, - : 1; - u_char : 2, - min_fr_len : 6; - u_char lng_typ : 1, - lng_fld : 1, - rxcrc_xf : 1, - artx : 1, - sarec : 1, - tx_jabber : 1, /* why is this called max_len in the manual? */ - hash_1 : 1, - lbpkpol : 1; - u_char : 6, - fdx : 1, - : 1; - u_char dummy_6 : 6, /* supposed to be ones */ - mult_ia : 1, - dis_bof : 1; - u_char dummy_1 : 1, /* supposed to be one */ - tx_ifs_retrig : 2, - mc_all : 1, - rcv_mon : 2, - frag_acpt : 1, - tstrttrs : 1; - u_char fretx : 1, - runt_eop : 1, - hw_sw_pin : 1, - big_endn : 1, - syncrqs : 1, - sttlen : 1, - tx_eop : 1, - rx_eop : 1; - u_char rbuf_size : 5, - rcvstop : 1, - : 2; -}; - -#define I82593_MAX_MULTICAST_ADDRESSES 128 /* Hardware hashed filter */ - -#endif /* _I82593_H */ -- cgit v1.2.3-59-g8ed1b From 6fb7c3778f0fba0bad099c30e834c413c4f8bcb5 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Thu, 28 Aug 2014 13:44:33 +0200 Subject: include/linux/phonedev.h: Remove unused header The header file include/linux/phonedev.h does not seem to be used anywhere. It was orphaned by 7326446c "Staging: remove telephony drivers". Remove it. Signed-off-by: Rasmus Villemoes Signed-off-by: David S. Miller --- include/linux/phonedev.h | 25 ------------------------- 1 file changed, 25 deletions(-) delete mode 100644 include/linux/phonedev.h (limited to 'include/linux') diff --git a/include/linux/phonedev.h b/include/linux/phonedev.h deleted file mode 100644 index 4269de99e320..000000000000 --- a/include/linux/phonedev.h +++ /dev/null @@ -1,25 +0,0 @@ -#ifndef __LINUX_PHONEDEV_H -#define __LINUX_PHONEDEV_H - -#include - -#ifdef __KERNEL__ - -#include - -struct phone_device { - struct phone_device *next; - const struct file_operations *f_op; - int (*open) (struct phone_device *, struct file *); - int board; /* Device private index */ - int minor; -}; - -extern int phonedev_init(void); -#define PHONE_MAJOR 100 -extern int phone_register_device(struct phone_device *, int unit); -#define PHONE_UNIT_ANY -1 -extern void phone_unregister_device(struct phone_device *); - -#endif -#endif -- cgit v1.2.3-59-g8ed1b From de20fe8e2cc3c4ca13fdb529e6720d9d199333fe Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Wed, 27 Aug 2014 21:26:35 -0700 Subject: net: Allocate a new 16 bits for flags in skbuff Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/skbuff.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index b69b7b512c06..3c9574c80933 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -598,6 +598,10 @@ struct sk_buff { __u32 reserved_tailroom; }; + kmemcheck_bitfield_begin(flags3); + /* 16 bit hole */ + kmemcheck_bitfield_end(flags3); + __be16 inner_protocol; __u16 inner_transport_header; __u16 inner_network_header; -- cgit v1.2.3-59-g8ed1b From 77cffe23c1f88835f6bd7b47bfa0c060c2969828 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Wed, 27 Aug 2014 21:26:46 -0700 Subject: net: Clarification of CHECKSUM_UNNECESSARY This patch: - Clarifies the specific requirements of devices returning CHECKSUM_UNNECESSARY (comments in skbuff.h). - Adds csum_level field to skbuff. 
This is used to express how many checksums are covered by CHECKSUM_UNNECESSARY (stores n - 1). This replaces the overloading of skb->encapsulation; that field is now only used to indicate inner headers are valid. - Change __skb_checksum_validate_needed to "consume" each checksum as indicated by csum_level as layers of the packet are parsed. - Remove skb_pop_rcv_encapsulation, no longer needed in the new csum_level model. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 2 -- include/linux/skbuff.h | 74 ++++++++++++++++++++++++++++++++++---------------- net/ipv4/gre_demux.c | 1 - 3 files changed, 51 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index beb377b2d4b7..67527f3d3be2 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1158,8 +1158,6 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb) if (!vs) goto drop; - skb_pop_rcv_encapsulation(skb); - vs->rcv(vs, skb, vxh->vx_vni); return 0; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 3c9574c80933..c93b5859a772 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -47,11 +47,29 @@ * * The hardware you're dealing with doesn't calculate the full checksum * (as in CHECKSUM_COMPLETE), but it does parse headers and verify checksums - * for specific protocols e.g. TCP/UDP/SCTP, then, for such packets it will - * set CHECKSUM_UNNECESSARY if their checksums are okay. skb->csum is still - * undefined in this case though. It is a bad option, but, unfortunately, - * nowadays most vendors do this. Apparently with the secret goal to sell - * you new devices, when you will add new protocol to your host, f.e. IPv6 8) + * for specific protocols. For such packets it will set CHECKSUM_UNNECESSARY + * if their checksums are okay. skb->csum is still undefined in this case + * though. It is a bad option, but, unfortunately, nowadays most vendors do + * this. Apparently with the secret goal to sell you new devices, when you + * will add new protocol to your host, f.e. IPv6 8) + * + * CHECKSUM_UNNECESSARY is applicable to following protocols: + * TCP: IPv6 and IPv4. + * UDP: IPv4 and IPv6. A device may apply CHECKSUM_UNNECESSARY to a + * zero UDP checksum for either IPv4 or IPv6, the networking stack + * may perform further validation in this case. + * GRE: only if the checksum is present in the header. + * SCTP: indicates the CRC in SCTP header has been validated. + * + * skb->csum_level indicates the number of consecutive checksums found in + * the packet minus one that have been verified as CHECKSUM_UNNECESSARY. + * For instance if a device receives an IPv6->UDP->GRE->IPv4->TCP packet + * and a device is able to verify the checksums for UDP (possibly zero), + * GRE (checksum flag is set), and TCP-- skb->csum_level would be set to + * two. If the device were only able to verify the UDP checksum and not + * GRE, either because it doesn't support GRE checksum of because GRE + * checksum is bad, skb->csum_level would be set to zero (TCP checksum is + * not considered in this case). 
* * CHECKSUM_COMPLETE: * @@ -112,6 +130,9 @@ #define CHECKSUM_COMPLETE 2 #define CHECKSUM_PARTIAL 3 +/* Maximum value in skb->csum_level */ +#define SKB_MAX_CSUM_LEVEL 3 + #define SKB_DATA_ALIGN(X) ALIGN(X, SMP_CACHE_BYTES) #define SKB_WITH_OVERHEAD(X) \ ((X) - SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) @@ -571,11 +592,7 @@ struct sk_buff { __u8 wifi_acked:1; __u8 no_fcs:1; __u8 head_frag:1; - /* Encapsulation protocol and NIC drivers should use - * this flag to indicate to each other if the skb contains - * encapsulated packet or not and maybe use the inner packet - * headers if needed - */ + /* Indicates the inner headers are valid in the skbuff. */ __u8 encapsulation:1; __u8 encap_hdr_csum:1; __u8 csum_valid:1; @@ -599,7 +616,8 @@ struct sk_buff { }; kmemcheck_bitfield_begin(flags3); - /* 16 bit hole */ + __u8 csum_level:2; + /* 14 bit hole */ kmemcheck_bitfield_end(flags3); __be16 inner_protocol; @@ -1866,18 +1884,6 @@ static inline int pskb_network_may_pull(struct sk_buff *skb, unsigned int len) return pskb_may_pull(skb, skb_network_offset(skb) + len); } -static inline void skb_pop_rcv_encapsulation(struct sk_buff *skb) -{ - /* Only continue with checksum unnecessary if device indicated - * it is valid across encapsulation (skb->encapsulation was set). - */ - if (skb->ip_summed == CHECKSUM_UNNECESSARY && !skb->encapsulation) - skb->ip_summed = CHECKSUM_NONE; - - skb->encapsulation = 0; - skb->csum_valid = 0; -} - /* * CPUs often take a performance hit when accessing unaligned memory * locations. The actual performance hit varies, it can be small if the @@ -2798,6 +2804,27 @@ static inline __sum16 skb_checksum_complete(struct sk_buff *skb) 0 : __skb_checksum_complete(skb); } +static inline void __skb_decr_checksum_unnecessary(struct sk_buff *skb) +{ + if (skb->ip_summed == CHECKSUM_UNNECESSARY) { + if (skb->csum_level == 0) + skb->ip_summed = CHECKSUM_NONE; + else + skb->csum_level--; + } +} + +static inline void __skb_incr_checksum_unnecessary(struct sk_buff *skb) +{ + if (skb->ip_summed == CHECKSUM_UNNECESSARY) { + if (skb->csum_level < SKB_MAX_CSUM_LEVEL) + skb->csum_level++; + } else if (skb->ip_summed == CHECKSUM_NONE) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + skb->csum_level = 0; + } +} + /* Check if we need to perform checksum complete validation. * * Returns true if checksum complete is needed, false otherwise @@ -2809,6 +2836,7 @@ static inline bool __skb_checksum_validate_needed(struct sk_buff *skb, { if (skb_csum_unnecessary(skb) || (zero_okay && !check)) { skb->csum_valid = 1; + __skb_decr_checksum_unnecessary(skb); return false; } diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c index 7c1a8ff974dd..0485bf7f8f03 100644 --- a/net/ipv4/gre_demux.c +++ b/net/ipv4/gre_demux.c @@ -125,7 +125,6 @@ static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, *csum_err = true; return -EINVAL; } - skb_pop_rcv_encapsulation(skb); options++; } -- cgit v1.2.3-59-g8ed1b From 662880f4420340aad4f9a62a349c6c9d4faa1a5d Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Wed, 27 Aug 2014 21:26:56 -0700 Subject: net: Allow GRO to use and set levels of checksum unnecessary Allow GRO path to "consume" checksums provided in CHECKSUM_UNNECESSARY and to report new checksums verified for use in fallback to normal path. Change GRO checksum path to track csum_level using a csum_cnt field in NAPI_GRO_CB. On GRO initialization, if ip_summed is CHECKSUM_UNNECESSARY, set NAPI_GRO_CB(skb)->csum_cnt to skb->csum_level + 1. 
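[Editor's aside, to make the counting concrete with an invented example: take the IPv6->UDP->GRE->IPv4->TCP packet from the previous commit's comment and assume the NIC validated the outer UDP and the GRE checksum but not the inner TCP one. A driver would then hand the skb up roughly like this (sketch, not from the patch):

	skb->ip_summed = CHECKSUM_UNNECESSARY;
	skb->csum_level = 1;	/* two checksums verified, stored as n - 1 */
	napi_gro_receive(napi, skb);

GRO starts with csum_cnt = skb->csum_level + 1 = 2, consumes one count for UDP and one for GRE, and when it then validates the inner TCP checksum itself with csum_cnt already at zero, __skb_incr_checksum_unnecessary() raises csum_level to 2 for the non-GRO fallback path.]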
For each checksum verified, decrement NAPI_GRO_CB(skb)->csum_cnt while it is greater than zero. If a checksum is verified and NAPI_GRO_CB(skb)->csum_cnt == 0, we have verified a deeper checksum than originally indicated in the skbuff, so increment csum_level (or initialize to CHECKSUM_UNNECESSARY if ip_summed is CHECKSUM_NONE or CHECKSUM_COMPLETE). Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/netdevice.h | 26 ++++++++++++-------------- net/core/dev.c | 24 ++++++++++++++++-------- net/ipv4/gre_offload.c | 7 ++----- net/ipv4/udp_offload.c | 5 +++-- 4 files changed, 33 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index dfc1d8b8bd0f..456eb1fe51e8 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1883,8 +1883,8 @@ struct napi_gro_cb { /* GRO checksum is valid */ u8 csum_valid:1; - /* Number encapsulation layers crossed */ - u8 encapsulation; + /* Number of checksums via CHECKSUM_UNNECESSARY */ + u8 csum_cnt:3; /* used to support CHECKSUM_COMPLETE for tunneling protocols */ __wsum csum; @@ -2179,8 +2179,7 @@ static inline bool __skb_gro_checksum_validate_needed(struct sk_buff *skb, __sum16 check) { return (skb->ip_summed != CHECKSUM_PARTIAL && - (skb->ip_summed != CHECKSUM_UNNECESSARY || - (NAPI_GRO_CB(skb)->encapsulation > skb->encapsulation)) && + NAPI_GRO_CB(skb)->csum_cnt == 0 && (!zero_okay || check)); } @@ -2196,18 +2195,17 @@ static inline __sum16 __skb_gro_checksum_validate_complete(struct sk_buff *skb, return __skb_gro_checksum_complete(skb); } -/* Update skb for CHECKSUM_UNNECESSARY when we verified a top level - * checksum or an encapsulated one during GRO. This saves work - * if we fallback to normal path with the packet. - */ static inline void skb_gro_incr_csum_unnecessary(struct sk_buff *skb) { - if (skb->ip_summed == CHECKSUM_UNNECESSARY) { - if (NAPI_GRO_CB(skb)->encapsulation) - skb->encapsulation = 1; - } else if (skb->ip_summed != CHECKSUM_PARTIAL) { - skb->ip_summed = CHECKSUM_UNNECESSARY; - skb->encapsulation = 0; + if (NAPI_GRO_CB(skb)->csum_cnt > 0) { + /* Consume a checksum from CHECKSUM_UNNECESSARY */ + NAPI_GRO_CB(skb)->csum_cnt--; + } else { + /* Update skb for CHECKSUM_UNNECESSARY and csum_level when we + * verified a new top level checksum or an encapsulated one + * during GRO. This saves work if we fallback to normal path. 
+ */ + __skb_incr_checksum_unnecessary(skb); } } diff --git a/net/core/dev.c b/net/core/dev.c index 26d296c2447c..a6077ef56345 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3962,13 +3962,6 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff gro_list_prepare(napi, skb); - if (skb->ip_summed == CHECKSUM_COMPLETE) { - NAPI_GRO_CB(skb)->csum = skb->csum; - NAPI_GRO_CB(skb)->csum_valid = 1; - } else { - NAPI_GRO_CB(skb)->csum_valid = 0; - } - rcu_read_lock(); list_for_each_entry_rcu(ptype, head, list) { if (ptype->type != type || !ptype->callbacks.gro_receive) @@ -3980,7 +3973,22 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff NAPI_GRO_CB(skb)->flush = 0; NAPI_GRO_CB(skb)->free = 0; NAPI_GRO_CB(skb)->udp_mark = 0; - NAPI_GRO_CB(skb)->encapsulation = 0; + + /* Setup for GRO checksum validation */ + switch (skb->ip_summed) { + case CHECKSUM_COMPLETE: + NAPI_GRO_CB(skb)->csum = skb->csum; + NAPI_GRO_CB(skb)->csum_valid = 1; + NAPI_GRO_CB(skb)->csum_cnt = 0; + break; + case CHECKSUM_UNNECESSARY: + NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1; + NAPI_GRO_CB(skb)->csum_valid = 0; + break; + default: + NAPI_GRO_CB(skb)->csum_cnt = 0; + NAPI_GRO_CB(skb)->csum_valid = 0; + } pp = ptype->callbacks.gro_receive(&napi->gro_list, skb); break; diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index d1bd16937d93..a4d7965fb880 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -172,12 +172,9 @@ static struct sk_buff **gre_gro_receive(struct sk_buff **head, } /* Don't bother verifying checksum if we're going to flush anyway. */ - if (greh->flags & GRE_CSUM) { - if (!NAPI_GRO_CB(skb)->flush && - skb_gro_checksum_simple_validate(skb)) + if ((greh->flags & GRE_CSUM) && !NAPI_GRO_CB(skb)->flush && + skb_gro_checksum_simple_validate(skb)) goto out_unlock; - NAPI_GRO_CB(skb)->encapsulation++; - } flush = 0; diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 8ed460e3753c..a6adff98382a 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -238,12 +238,13 @@ struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb, int flush = 1; if (NAPI_GRO_CB(skb)->udp_mark || - (!skb->encapsulation && !NAPI_GRO_CB(skb)->csum_valid)) + (skb->ip_summed != CHECKSUM_PARTIAL && + NAPI_GRO_CB(skb)->csum_cnt == 0 && + !NAPI_GRO_CB(skb)->csum_valid)) goto out; /* mark that this skb passed once through the udp gro layer */ NAPI_GRO_CB(skb)->udp_mark = 1; - NAPI_GRO_CB(skb)->encapsulation++; rcu_read_lock(); uo_priv = rcu_dereference(udp_offload_base); -- cgit v1.2.3-59-g8ed1b From 10b3ad8c21bb4b135768c30dd4c51a1c744da699 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 29 Aug 2014 21:07:24 -0700 Subject: net: Do txq_trans_update() in netdev_start_xmit() That way we don't have to audit every call site to make sure it is doing this properly. Signed-off-by: David S. 
Miller --- drivers/net/wan/dlci.c | 6 ++++-- include/linux/netdevice.h | 10 ++++++++-- net/core/dev.c | 7 ++----- net/core/netpoll.c | 4 +--- net/core/pktgen.c | 3 +-- net/packet/af_packet.c | 7 ++----- net/sched/sch_teql.c | 3 +-- 7 files changed, 19 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/wan/dlci.c b/drivers/net/wan/dlci.c index 81b22a180aad..6427e8283419 100644 --- a/drivers/net/wan/dlci.c +++ b/drivers/net/wan/dlci.c @@ -192,8 +192,10 @@ static netdev_tx_t dlci_transmit(struct sk_buff *skb, struct net_device *dev) { struct dlci_local *dlp = netdev_priv(dev); - if (skb) - netdev_start_xmit(skb, dlp->slave); + if (skb) { + struct netdev_queue *txq = skb_get_tx_queue(dev, skb); + netdev_start_xmit(skb, dlp->slave, txq); + } return NETDEV_TX_OK; } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 456eb1fe51e8..16171802ea7d 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3437,11 +3437,17 @@ static inline netdev_tx_t __netdev_start_xmit(const struct net_device_ops *ops, return ops->ndo_start_xmit(skb, dev); } -static inline netdev_tx_t netdev_start_xmit(struct sk_buff *skb, struct net_device *dev) +static inline netdev_tx_t netdev_start_xmit(struct sk_buff *skb, struct net_device *dev, + struct netdev_queue *txq) { const struct net_device_ops *ops = dev->netdev_ops; + int rc; - return __netdev_start_xmit(ops, skb, dev); + rc = __netdev_start_xmit(ops, skb, dev); + if (rc == NETDEV_TX_OK) + txq_trans_update(txq); + + return rc; } int netdev_class_create_file_ns(struct class_attribute *class_attr, diff --git a/net/core/dev.c b/net/core/dev.c index a6077ef56345..6392adaaa22f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2666,10 +2666,8 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, skb_len = skb->len; trace_net_dev_start_xmit(skb, dev); - rc = netdev_start_xmit(skb, dev); + rc = netdev_start_xmit(skb, dev, txq); trace_net_dev_xmit(skb, rc, dev, skb_len); - if (rc == NETDEV_TX_OK) - txq_trans_update(txq); return rc; } @@ -2685,7 +2683,7 @@ gso: skb_len = nskb->len; trace_net_dev_start_xmit(nskb, dev); - rc = netdev_start_xmit(nskb, dev); + rc = netdev_start_xmit(nskb, dev, txq); trace_net_dev_xmit(nskb, rc, dev, skb_len); if (unlikely(rc != NETDEV_TX_OK)) { if (rc & ~NETDEV_TX_MASK) @@ -2694,7 +2692,6 @@ gso: skb->next = nskb; return rc; } - txq_trans_update(txq); if (unlikely(netif_xmit_stopped(txq) && skb->next)) return NETDEV_TX_BUSY; } while (skb->next); diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 12b1df976562..05bc57edaa81 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -91,9 +91,7 @@ static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev, skb->vlan_tci = 0; } - status = netdev_start_xmit(skb, dev); - if (status == NETDEV_TX_OK) - txq_trans_update(txq); + status = netdev_start_xmit(skb, dev, txq); out: return status; diff --git a/net/core/pktgen.c b/net/core/pktgen.c index d81b540096c3..34bd2ff9f121 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3335,11 +3335,10 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) goto unlock; } atomic_inc(&(pkt_dev->skb->users)); - ret = netdev_start_xmit(pkt_dev->skb, odev); + ret = netdev_start_xmit(pkt_dev->skb, odev, txq); switch (ret) { case NETDEV_TX_OK: - txq_trans_update(txq); pkt_dev->last_ok = 1; pkt_dev->sofar++; pkt_dev->seq_num++; diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index b7a7f5a721bd..fe305a05a8fc 100644 --- a/net/packet/af_packet.c 
+++ b/net/packet/af_packet.c @@ -258,11 +258,8 @@ static int packet_direct_xmit(struct sk_buff *skb) local_bh_disable(); HARD_TX_LOCK(dev, txq, smp_processor_id()); - if (!netif_xmit_frozen_or_drv_stopped(txq)) { - ret = netdev_start_xmit(skb, dev); - if (ret == NETDEV_TX_OK) - txq_trans_update(txq); - } + if (!netif_xmit_frozen_or_drv_stopped(txq)) + ret = netdev_start_xmit(skb, dev, txq); HARD_TX_UNLOCK(dev, txq); local_bh_enable(); diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c index 64cd93ca8104..193dc2cba1ec 100644 --- a/net/sched/sch_teql.c +++ b/net/sched/sch_teql.c @@ -316,8 +316,7 @@ restart: unsigned int length = qdisc_pkt_len(skb); if (!netif_xmit_frozen_or_stopped(slave_txq) && - netdev_start_xmit(skb, slave) == NETDEV_TX_OK) { - txq_trans_update(slave_txq); + netdev_start_xmit(skb, slave, slave_txq) == NETDEV_TX_OK) { __netif_tx_unlock(slave_txq); master->slaves = NEXT_SLAVE(q); netif_wake_queue(dev); -- cgit v1.2.3-59-g8ed1b From fa2dbdc253c2aee2a760c64de454cb62469ec11d Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 29 Aug 2014 21:55:22 -0700 Subject: net: Pass a "more" indication down into netdev_start_xmit() code paths. For now it will always be false. Signed-off-by: David S. Miller --- drivers/net/wan/dlci.c | 2 +- include/linux/netdevice.h | 9 +++++---- net/atm/mpc.c | 2 +- net/core/dev.c | 2 +- net/core/netpoll.c | 2 +- net/core/pktgen.c | 2 +- net/packet/af_packet.c | 2 +- net/sched/sch_teql.c | 3 ++- 8 files changed, 13 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/wan/dlci.c b/drivers/net/wan/dlci.c index 6427e8283419..ae6ecf401189 100644 --- a/drivers/net/wan/dlci.c +++ b/drivers/net/wan/dlci.c @@ -194,7 +194,7 @@ static netdev_tx_t dlci_transmit(struct sk_buff *skb, struct net_device *dev) if (skb) { struct netdev_queue *txq = skb_get_tx_queue(dev, skb); - netdev_start_xmit(skb, dlp->slave, txq); + netdev_start_xmit(skb, dlp->slave, txq, false); } return NETDEV_TX_OK; } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 16171802ea7d..5050218c5b7f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3431,19 +3431,20 @@ int __init dev_proc_init(void); #endif static inline netdev_tx_t __netdev_start_xmit(const struct net_device_ops *ops, - struct sk_buff *skb, struct net_device *dev) + struct sk_buff *skb, struct net_device *dev, + bool more) { - skb->xmit_more = 0; + skb->xmit_more = more ? 
1 : 0; return ops->ndo_start_xmit(skb, dev); } static inline netdev_tx_t netdev_start_xmit(struct sk_buff *skb, struct net_device *dev, - struct netdev_queue *txq) + struct netdev_queue *txq, bool more) { const struct net_device_ops *ops = dev->netdev_ops; int rc; - rc = __netdev_start_xmit(ops, skb, dev); + rc = __netdev_start_xmit(ops, skb, dev, more); if (rc == NETDEV_TX_OK) txq_trans_update(txq); diff --git a/net/atm/mpc.c b/net/atm/mpc.c index d662da161e5a..0e982222d425 100644 --- a/net/atm/mpc.c +++ b/net/atm/mpc.c @@ -599,7 +599,7 @@ static netdev_tx_t mpc_send_packet(struct sk_buff *skb, } non_ip: - return __netdev_start_xmit(mpc->old_ops, skb, dev); + return __netdev_start_xmit(mpc->old_ops, skb, dev, false); } static int atm_mpoa_vcc_attach(struct atm_vcc *vcc, void __user *arg) diff --git a/net/core/dev.c b/net/core/dev.c index ab7bb809711e..f0ed5a611a97 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2610,7 +2610,7 @@ static int xmit_one(struct sk_buff *skb, struct net_device *dev, len = skb->len; trace_net_dev_start_xmit(skb, dev); - rc = netdev_start_xmit(skb, dev, txq); + rc = netdev_start_xmit(skb, dev, txq, false); trace_net_dev_xmit(skb, rc, dev, len); return rc; diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 05bc57edaa81..e6645b4f330a 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -91,7 +91,7 @@ static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev, skb->vlan_tci = 0; } - status = netdev_start_xmit(skb, dev, txq); + status = netdev_start_xmit(skb, dev, txq, false); out: return status; diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 34bd2ff9f121..5b36a9428c59 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3335,7 +3335,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) goto unlock; } atomic_inc(&(pkt_dev->skb->users)); - ret = netdev_start_xmit(pkt_dev->skb, odev, txq); + ret = netdev_start_xmit(pkt_dev->skb, odev, txq, false); switch (ret) { case NETDEV_TX_OK: diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index fe305a05a8fc..87d20f48ff06 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -259,7 +259,7 @@ static int packet_direct_xmit(struct sk_buff *skb) HARD_TX_LOCK(dev, txq, smp_processor_id()); if (!netif_xmit_frozen_or_drv_stopped(txq)) - ret = netdev_start_xmit(skb, dev, txq); + ret = netdev_start_xmit(skb, dev, txq, false); HARD_TX_UNLOCK(dev, txq); local_bh_enable(); diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c index 193dc2cba1ec..aaa8d03ed054 100644 --- a/net/sched/sch_teql.c +++ b/net/sched/sch_teql.c @@ -316,7 +316,8 @@ restart: unsigned int length = qdisc_pkt_len(skb); if (!netif_xmit_frozen_or_stopped(slave_txq) && - netdev_start_xmit(skb, slave, slave_txq) == NETDEV_TX_OK) { + netdev_start_xmit(skb, slave, slave_txq, false) == + NETDEV_TX_OK) { __netif_tx_unlock(slave_txq); master->slaves = NEXT_SLAVE(q); netif_wake_queue(dev); -- cgit v1.2.3-59-g8ed1b From 50cbe9ab5f8d92d2d4a327b56e96559d8f63a1fa Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sat, 30 Aug 2014 19:13:51 -0700 Subject: net: Validate xmit SKBs right when we pull them out of the qdisc. Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 1 + net/core/dev.c | 6 +----- net/sched/sch_generic.c | 5 ++++- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5050218c5b7f..47c49ba2dcf4 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2827,6 +2827,7 @@ int dev_set_mac_address(struct net_device *, struct sockaddr *); int dev_change_carrier(struct net_device *, bool new_carrier); int dev_get_phys_port_id(struct net_device *dev, struct netdev_phys_port_id *ppid); +struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev); int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq); int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb); diff --git a/net/core/dev.c b/net/core/dev.c index 704a5434f77d..75bc5b068a13 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2656,7 +2656,7 @@ struct sk_buff *validate_xmit_vlan(struct sk_buff *skb, netdev_features_t featur return skb; } -static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev) +struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev) { netdev_features_t features; @@ -2719,10 +2719,6 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, { int rc = NETDEV_TX_OK; - skb = validate_xmit_skb(skb, dev); - if (!skb) - return rc; - if (likely(!skb->next)) return xmit_one(skb, dev, txq, false); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 05b3f5d104af..f178798a5836 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -70,8 +70,11 @@ static inline struct sk_buff *dequeue_skb(struct Qdisc *q) } else skb = NULL; } else { - if (!(q->flags & TCQ_F_ONETXQUEUE) || !netif_xmit_frozen_or_stopped(txq)) + if (!(q->flags & TCQ_F_ONETXQUEUE) || !netif_xmit_frozen_or_stopped(txq)) { skb = q->dequeue(q); + if (skb) + skb = validate_xmit_skb(skb, qdisc_dev(q)); + } } return skb; -- cgit v1.2.3-59-g8ed1b From ce93718fb7cdbc064c3000ff59e4d3200bdfa744 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sat, 30 Aug 2014 19:22:20 -0700 Subject: net: Don't keep around original SKB when we software segment GSO frames. Just maintain the list properly by returning the head of the remaining SKB list from dev_hard_start_xmit(). Signed-off-by: David S. 
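To make the new calling convention concrete, here is a condensed sketch of the caller side, based on the __dev_queue_xmit() hunk below (recursion accounting and error paths trimmed): the transmit status moves into an out-parameter and the function returns the head of whatever part of the segment list is still unsent.

int rc;

HARD_TX_LOCK(dev, txq, smp_processor_id());
if (!netif_xmit_stopped(txq)) {
	/* skb comes back as the head of the not-yet-sent remainder
	 * of the (possibly software-segmented) list; rc holds the
	 * NETDEV_TX_* status of the last transmit attempt.
	 */
	skb = dev_hard_start_xmit(skb, dev, txq, &rc);
	if (dev_xmit_complete(rc)) {
		HARD_TX_UNLOCK(dev, txq);
		goto out;
	}
}
HARD_TX_UNLOCK(dev, txq);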
Miller --- include/linux/netdevice.h | 4 +-- net/core/dev.c | 79 +++++++++-------------------------------------- net/sched/sch_generic.c | 2 +- 3 files changed, 17 insertions(+), 68 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 47c49ba2dcf4..202c25a9aadf 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2828,8 +2828,8 @@ int dev_change_carrier(struct net_device *, bool new_carrier); int dev_get_phys_port_id(struct net_device *dev, struct netdev_phys_port_id *ppid); struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev); -int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, - struct netdev_queue *txq); +struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, + struct netdev_queue *txq, int *ret); int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb); int dev_forward_skb(struct net_device *dev, struct sk_buff *skb); bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb); diff --git a/net/core/dev.c b/net/core/dev.c index 75bc5b068a13..c89da4f306b1 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2485,52 +2485,6 @@ static int illegal_highdma(struct net_device *dev, struct sk_buff *skb) return 0; } -struct dev_gso_cb { - void (*destructor)(struct sk_buff *skb); -}; - -#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb) - -static void dev_gso_skb_destructor(struct sk_buff *skb) -{ - struct dev_gso_cb *cb; - - kfree_skb_list(skb->next); - skb->next = NULL; - - cb = DEV_GSO_CB(skb); - if (cb->destructor) - cb->destructor(skb); -} - -/** - * dev_gso_segment - Perform emulated hardware segmentation on skb. - * @skb: buffer to segment - * @features: device features as applicable to this skb - * - * This function segments the given skb and stores the list of segments - * in skb->next. - */ -static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features) -{ - struct sk_buff *segs; - - segs = skb_gso_segment(skb, features); - - /* Verifying header integrity only. */ - if (!segs) - return 0; - - if (IS_ERR(segs)) - return PTR_ERR(segs); - - skb->next = segs; - DEV_GSO_CB(skb)->destructor = skb->destructor; - skb->destructor = dev_gso_skb_destructor; - - return 0; -} - /* If MPLS offload request, verify we are testing hardware MPLS features * instead of standard features for the netdev. 
*/ @@ -2682,8 +2636,13 @@ struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev) features &= dev->hw_enc_features; if (netif_needs_gso(skb, features)) { - if (unlikely(dev_gso_segment(skb, features))) - goto out_kfree_skb; + struct sk_buff *segs; + + segs = skb_gso_segment(skb, features); + kfree_skb(skb); + if (IS_ERR(segs)) + segs = NULL; + skb = segs; } else { if (skb_needs_linearize(skb, features) && __skb_linearize(skb)) @@ -2714,26 +2673,16 @@ out_null: return NULL; } -int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, - struct netdev_queue *txq) +struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, + struct netdev_queue *txq, int *ret) { - int rc = NETDEV_TX_OK; - - if (likely(!skb->next)) - return xmit_one(skb, dev, txq, false); - - skb->next = xmit_list(skb->next, dev, txq, &rc); - if (likely(skb->next == NULL)) { - skb->destructor = DEV_GSO_CB(skb)->destructor; - consume_skb(skb); - return rc; + if (likely(!skb->next)) { + *ret = xmit_one(skb, dev, txq, false); + return skb; } - kfree_skb(skb); - - return rc; + return xmit_list(skb, dev, txq, ret); } -EXPORT_SYMBOL_GPL(dev_hard_start_xmit); static void qdisc_pkt_len_init(struct sk_buff *skb) { @@ -2945,7 +2894,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) if (!netif_xmit_stopped(txq)) { __this_cpu_inc(xmit_recursion); - rc = dev_hard_start_xmit(skb, dev, txq); + skb = dev_hard_start_xmit(skb, dev, txq, &rc); __this_cpu_dec(xmit_recursion); if (dev_xmit_complete(rc)) { HARD_TX_UNLOCK(dev, txq); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index f178798a5836..a8bf9f9928bd 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -129,7 +129,7 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, HARD_TX_LOCK(dev, txq, smp_processor_id()); if (!netif_xmit_frozen_or_stopped(txq)) - ret = dev_hard_start_xmit(skb, dev, txq); + skb = dev_hard_start_xmit(skb, dev, txq, &ret); HARD_TX_UNLOCK(dev, txq); -- cgit v1.2.3-59-g8ed1b From 5a21232983aa7acfe7fd26170832a9e0a4a7b4ae Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Sun, 31 Aug 2014 15:12:41 -0700 Subject: net: Support for csum_bad in skbuff This flag indicates that an invalid checksum was detected in the packet. The __skb_mark_checksum_bad helper function was added to set it. Checksums can be marked bad from a driver or from the GRO path (the latter is implemented in this patch). csum_bad is checked in __skb_checksum_validate_complete (i.e. when calling that with ip_summed == CHECKSUM_NONE). csum_bad works in conjunction with the ip_summed value. In the case that ip_summed is CHECKSUM_NONE and csum_bad is set, this implies that the first (or next) checksum encountered in the packet is bad. When ip_summed is CHECKSUM_UNNECESSARY, the first checksum after the last one validated is bad. For example, if ip_summed == CHECKSUM_UNNECESSARY, csum_level == 1, and csum_bad is set, then the third checksum in the packet is bad. In the normal path, the packet will be dropped when the protocol layer with the bad checksum is processed: __skb_decr_checksum_unnecessary is called twice for the good checksums, changing ip_summed to CHECKSUM_NONE, so that __skb_checksum_validate_complete is called to validate the third checksum, and that validation will fail since csum_bad is set. Signed-off-by: Tom Herbert Signed-off-by: David S.
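To make the counting above concrete, here is a small helper one could write (illustrative only, not part of the patch) returning which checksum, counted from zero at the start of the packet, the marking state declares bad:

static inline int skb_bad_csum_index(const struct sk_buff *skb)
{
	if (!skb->csum_bad)
		return -1;	/* nothing marked bad */
	if (skb->ip_summed == CHECKSUM_UNNECESSARY)
		/* csum_level + 1 checksums were validated, so the
		 * next one is the bad one. */
		return skb->csum_level + 1;
	/* CHECKSUM_NONE: the first (or next) checksum is bad. */
	return 0;
}

With csum_level == 1 this returns 2, i.e. the third checksum, matching the example in the changelog.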
Miller --- include/linux/netdevice.h | 4 +++- include/linux/skbuff.h | 21 ++++++++++++++++++++- net/core/dev.c | 2 +- 3 files changed, 24 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 202c25a9aadf..a0ab6d9d400a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2216,7 +2216,9 @@ static inline void skb_gro_incr_csum_unnecessary(struct sk_buff *skb) if (__skb_gro_checksum_validate_needed(skb, zero_okay, check)) \ __ret = __skb_gro_checksum_validate_complete(skb, \ compute_pseudo(skb, proto)); \ - if (!__ret) \ + if (__ret) \ + __skb_mark_checksum_bad(skb); \ + else \ skb_gro_incr_csum_unnecessary(skb); \ __ret; \ }) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index c93b5859a772..23710a243439 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -617,7 +617,8 @@ struct sk_buff { kmemcheck_bitfield_begin(flags3); __u8 csum_level:2; - /* 14 bit hole */ + __u8 csum_bad:1; + /* 13 bit hole */ kmemcheck_bitfield_end(flags3); __be16 inner_protocol; @@ -2825,6 +2826,21 @@ static inline void __skb_incr_checksum_unnecessary(struct sk_buff *skb) } } +static inline void __skb_mark_checksum_bad(struct sk_buff *skb) +{ + /* Mark current checksum as bad (typically called from GRO + * path). In the case that ip_summed is CHECKSUM_NONE + * this must be the first checksum encountered in the packet. + * When ip_summed is CHECKSUM_UNNECESSARY, this is the first + * checksum after the last one validated. For UDP, a zero + * checksum can not be marked as bad. + */ + + if (skb->ip_summed == CHECKSUM_NONE || + skb->ip_summed == CHECKSUM_UNNECESSARY) + skb->csum_bad = 1; +} + /* Check if we need to perform checksum complete validation. * * Returns true if checksum complete is needed, false otherwise @@ -2866,6 +2882,9 @@ static inline __sum16 __skb_checksum_validate_complete(struct sk_buff *skb, skb->csum_valid = 1; return 0; } + } else if (skb->csum_bad) { + /* ip_summed == CHECKSUM_NONE in this case */ + return 1; } skb->csum = psum; diff --git a/net/core/dev.c b/net/core/dev.c index 6857d57aa294..3774afc3bebf 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3918,7 +3918,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff if (!(skb->dev->features & NETIF_F_GRO)) goto normal; - if (skb_is_gso(skb) || skb_has_frag_list(skb)) + if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad) goto normal; gro_list_prepare(napi, skb); -- cgit v1.2.3-59-g8ed1b From d96535a17dbbafd567961d14c08c0984ddda9c3c Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Sun, 31 Aug 2014 15:12:42 -0700 Subject: net: Infrastructure for checksum unnecessary conversions For normal path, added skb_checksum_try_convert which is called to attempt to convert CHECKSUM_UNNECESSARY to CHECKSUM_COMPLETE. The primary condition to allow this is that ip_summed is CHECKSUM_NONE and csum_valid is true, which will be the state after consuming a CHECKSUM_UNNECESSARY. For GRO path, added skb_gro_checksum_try_convert which is the GRO analogue of skb_checksum_try_convert. The primary condition to allow this is that NAPI_GRO_CB(skb)->csum_cnt == 0 and NAPI_GRO_CB(skb)->csum_valid is set. This implies that we have consumed all available CHECKSUM_UNNECESSARY checksums in the GRO path. Signed-off-by: Tom Herbert Signed-off-by: David S. 
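The intended call pattern for a protocol handler, sketched with placeholder names (check is the protocol's checksum field, compute_pseudo its pseudo-header helper; the UDP patch later in this series is the first real user):

/* Receive path, after this packet's own checksum has been validated
 * (which consumes a CHECKSUM_UNNECESSARY and can leave ip_summed at
 * CHECKSUM_NONE with csum_valid set):
 */
if (check)	/* a zero checksum has nothing worth converting */
	skb_checksum_try_convert(skb, proto, check, compute_pseudo);

/* If the conversion conditions held, ip_summed is now
 * CHECKSUM_COMPLETE and skb->csum holds the complement of the
 * pseudo-header sum, usable for any inner (tunneled) checksums.
 */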
Miller --- include/linux/netdevice.h | 20 ++++++++++++++++++++ include/linux/skbuff.h | 20 ++++++++++++++++++++ 2 files changed, 40 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a0ab6d9d400a..5be20a7bbb0d 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2233,6 +2233,26 @@ static inline void skb_gro_incr_csum_unnecessary(struct sk_buff *skb) #define skb_gro_checksum_simple_validate(skb) \ __skb_gro_checksum_validate(skb, 0, false, 0, null_compute_pseudo) +static inline bool __skb_gro_checksum_convert_check(struct sk_buff *skb) +{ + return (NAPI_GRO_CB(skb)->csum_cnt == 0 && + !NAPI_GRO_CB(skb)->csum_valid); +} + +static inline void __skb_gro_checksum_convert(struct sk_buff *skb, + __sum16 check, __wsum pseudo) +{ + NAPI_GRO_CB(skb)->csum = ~pseudo; + NAPI_GRO_CB(skb)->csum_valid = 1; +} + +#define skb_gro_checksum_try_convert(skb, proto, check, compute_pseudo) \ +do { \ + if (__skb_gro_checksum_convert_check(skb)) \ + __skb_gro_checksum_convert(skb, check, \ + compute_pseudo(skb, proto)); \ +} while (0) + static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 23710a243439..02529fcad1ac 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2942,6 +2942,26 @@ static inline __wsum null_compute_pseudo(struct sk_buff *skb, int proto) #define skb_checksum_simple_validate(skb) \ __skb_checksum_validate(skb, 0, true, false, 0, null_compute_pseudo) +static inline bool __skb_checksum_convert_check(struct sk_buff *skb) +{ + return (skb->ip_summed == CHECKSUM_NONE && + skb->csum_valid && !skb->csum_bad); +} + +static inline void __skb_checksum_convert(struct sk_buff *skb, + __sum16 check, __wsum pseudo) +{ + skb->csum = ~pseudo; + skb->ip_summed = CHECKSUM_COMPLETE; +} + +#define skb_checksum_try_convert(skb, proto, check, compute_pseudo) \ +do { \ + if (__skb_checksum_convert_check(skb)) \ + __skb_checksum_convert(skb, check, \ + compute_pseudo(skb, proto)); \ +} while (0) + #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) void nf_conntrack_destroy(struct nf_conntrack *nfct); static inline void nf_conntrack_put(struct nf_conntrack *nfct) -- cgit v1.2.3-59-g8ed1b From 2abb7cdc0dc84e99b76ef983a1ae1978922aa9b3 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Sun, 31 Aug 2014 15:12:43 -0700 Subject: udp: Add support for doing checksum unnecessary conversion Add support for doing CHECKSUM_UNNECESSARY to CHECKSUM_COMPLETE conversion in UDP tunneling path. In the normal UDP path, we call skb_checksum_try_convert after locating the UDP socket. The check is that checksum conversion is enabled for the socket (new flag in UDP socket) and that checksum field is non-zero. In the UDP GRO path, we call skb_gro_checksum_try_convert after checksum is validated and checksum field is non-zero. Since this is already in GRO we assume that checksum conversion is always wanted. Signed-off-by: Tom Herbert Signed-off-by: David S. 
Miller --- include/linux/udp.h | 16 +++++++++++++++- net/ipv4/udp.c | 4 ++++ net/ipv4/udp_offload.c | 25 +++++++++++++++++-------- net/ipv6/udp.c | 4 ++++ net/ipv6/udp_offload.c | 24 +++++++++++++++++------- 5 files changed, 57 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/udp.h b/include/linux/udp.h index 247cfdcc4b08..ee3277593222 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -49,7 +49,11 @@ struct udp_sock { unsigned int corkflag; /* Cork is required */ __u8 encap_type; /* Is this an Encapsulation socket? */ unsigned char no_check6_tx:1,/* Send zero UDP6 checksums on TX? */ - no_check6_rx:1;/* Allow zero UDP6 checksums on RX? */ + no_check6_rx:1,/* Allow zero UDP6 checksums on RX? */ + convert_csum:1;/* On receive, convert checksum + * unnecessary to checksum complete + * if possible. + */ /* * Following member retains the information to create a UDP header * when the socket is uncorked. @@ -98,6 +102,16 @@ static inline bool udp_get_no_check6_rx(struct sock *sk) return udp_sk(sk)->no_check6_rx; } +static inline void udp_set_convert_csum(struct sock *sk, bool val) +{ + udp_sk(sk)->convert_csum = val; +} + +static inline bool udp_get_convert_csum(struct sock *sk) +{ + return udp_sk(sk)->convert_csum; +} + #define udp_portaddr_for_each_entry(__sk, node, list) \ hlist_nulls_for_each_entry(__sk, node, list, __sk_common.skc_portaddr_node) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 3549c21fe5f7..0da3849fd35b 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1788,6 +1788,10 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, if (sk != NULL) { int ret; + if (udp_sk(sk)->convert_csum && uh->check && !IS_UDPLITE(sk)) + skb_checksum_try_convert(skb, IPPROTO_UDP, uh->check, + inet_compute_pseudo); + ret = udp_queue_rcv_skb(sk, skb); sock_put(sk); diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index a6adff98382a..84e0e05c9c0e 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -290,16 +290,25 @@ static struct sk_buff **udp4_gro_receive(struct sk_buff **head, { struct udphdr *uh = udp_gro_udphdr(skb); - /* Don't bother verifying checksum if we're going to flush anyway. */ - if (unlikely(!uh) || - (!NAPI_GRO_CB(skb)->flush && - skb_gro_checksum_validate_zero_check(skb, IPPROTO_UDP, uh->check, - inet_gro_compute_pseudo))) { - NAPI_GRO_CB(skb)->flush = 1; - return NULL; - } + if (unlikely(!uh)) + goto flush; + /* Don't bother verifying checksum if we're going to flush anyway. 
*/ + if (!NAPI_GRO_CB(skb)->flush) + goto skip; + + if (skb_gro_checksum_validate_zero_check(skb, IPPROTO_UDP, uh->check, + inet_gro_compute_pseudo)) + goto flush; + else if (uh->check) + skb_gro_checksum_try_convert(skb, IPPROTO_UDP, uh->check, + inet_gro_compute_pseudo); +skip: return udp_gro_receive(head, skb, uh); + +flush: + NAPI_GRO_CB(skb)->flush = 1; + return NULL; } int udp_gro_complete(struct sk_buff *skb, int nhoff) diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 12fcce8fba46..f6ba535b6feb 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -891,6 +891,10 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, goto csum_error; } + if (udp_sk(sk)->convert_csum && uh->check && !IS_UDPLITE(sk)) + skb_checksum_try_convert(skb, IPPROTO_UDP, uh->check, + ip6_compute_pseudo); + ret = udpv6_queue_rcv_skb(sk, skb); sock_put(sk); diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index b13e377e9c53..89cb9a9b8537 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -134,16 +134,26 @@ static struct sk_buff **udp6_gro_receive(struct sk_buff **head, { struct udphdr *uh = udp_gro_udphdr(skb); + if (unlikely(!uh)) + goto flush; + /* Don't bother verifying checksum if we're going to flush anyway. */ - if (unlikely(!uh) || - (!NAPI_GRO_CB(skb)->flush && - skb_gro_checksum_validate_zero_check(skb, IPPROTO_UDP, uh->check, - ip6_gro_compute_pseudo))) { - NAPI_GRO_CB(skb)->flush = 1; - return NULL; - } + if (!NAPI_GRO_CB(skb)->flush) + goto skip; + if (skb_gro_checksum_validate_zero_check(skb, IPPROTO_UDP, uh->check, + ip6_gro_compute_pseudo)) + goto flush; + else if (uh->check) + skb_gro_checksum_try_convert(skb, IPPROTO_UDP, uh->check, + ip6_gro_compute_pseudo); + +skip: return udp_gro_receive(head, skb, uh); + +flush: + NAPI_GRO_CB(skb)->flush = 1; + return NULL; } int udp6_gro_complete(struct sk_buff *skb, int nhoff) -- cgit v1.2.3-59-g8ed1b From 940001762ac514810e305aab356983829e5fa82a Mon Sep 17 00:00:00 2001 From: Ying Xue Date: Wed, 3 Sep 2014 09:22:36 +0800 Subject: lib/rhashtable: allow user to set the minimum shifts of shrinking Although the rhashtable library allows its user to specify a fairly big initial size for the hash table, the table may be shrunk to a very small size - HASH_MIN_SIZE(4) - the first time an object is removed from it. Subsequently, even if the total number of objects stored in the table stays well below the user's initial setting for a long time, the table size is still dynamically adjusted by rhashtable_shrink() or rhashtable_expand() each time an object is inserted or removed. However, as synchronize_rcu() has to be called whenever the table is shrunk or expanded by those two functions, we should permit the user to set a minimum table size, by configuring a minimum number of shifts according to their specific requirements, and thereby avoid these expensive shrink and expand operations and their synchronize_rcu() calls. Signed-off-by: Ying Xue Acked-by: Thomas Graf Signed-off-by: David S. Miller
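A sketch of the knob in use, with an illustrative object type and parameter choice (only min_shift/max_shift are taken from the hunks below; the rest of the field set is assumed from this era of the API):

/* Table expected to hold around 1024 entries. Without min_shift it
 * could collapse to HASH_MIN_SIZE (4 buckets) after an early removal;
 * pinning min_shift keeps it at 256 buckets or more. my_obj is
 * illustrative.
 */
struct rhashtable_params params = {
	.nelem_hint	= 1024,
	.head_offset	= offsetof(struct my_obj, node),
	.key_offset	= offsetof(struct my_obj, key),
	.key_len	= sizeof(u32),
	.hashfn		= jhash,
	.max_shift	= 16,	/* at most 64K buckets */
	.min_shift	= 8,	/* never shrink below 256 buckets */
};

err = rhashtable_init(&ht, &params);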
--- include/linux/rhashtable.h | 2 ++ lib/rhashtable.c | 12 ++++++++---- 2 files changed, 10 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 36826c0166c5..fb298e9d6d3a 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -44,6 +44,7 @@ struct rhashtable; * @head_offset: Offset of rhash_head in struct to be hashed * @hash_rnd: Seed to use while hashing * @max_shift: Maximum number of shifts while expanding + * @min_shift: Minimum number of shifts while shrinking * @hashfn: Function to hash key * @obj_hashfn: Function to hash object * @grow_decision: If defined, may return true if table should expand @@ -57,6 +58,7 @@ struct rhashtable_params { size_t head_offset; u32 hash_rnd; size_t max_shift; + size_t min_shift; rht_hashfn_t hashfn; rht_obj_hashfn_t obj_hashfn; bool (*grow_decision)(const struct rhashtable *ht, diff --git a/lib/rhashtable.c b/lib/rhashtable.c index a2c78810ebc1..8dfec3f26d4c 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -298,7 +298,7 @@ int rhashtable_shrink(struct rhashtable *ht, gfp_t flags) ASSERT_RHT_MUTEX(ht); - if (tbl->size <= HASH_MIN_SIZE) + if (ht->shift <= ht->p.min_shift) return 0; ntbl = bucket_table_alloc(tbl->size / 2, flags); @@ -506,9 +506,10 @@ void *rhashtable_lookup_compare(const struct rhashtable *ht, u32 hash, } EXPORT_SYMBOL_GPL(rhashtable_lookup_compare); -static size_t rounded_hashtable_size(unsigned int nelem) +static size_t rounded_hashtable_size(struct rhashtable_params *params) { - return max(roundup_pow_of_two(nelem * 4 / 3), HASH_MIN_SIZE); + return max(roundup_pow_of_two(params->nelem_hint * 4 / 3), + 1UL << params->min_shift); } /** @@ -566,8 +567,11 @@ int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params) (!params->key_len && !params->obj_hashfn)) return -EINVAL; + params->min_shift = max_t(size_t, params->min_shift, + ilog2(HASH_MIN_SIZE)); + if (params->nelem_hint) - size = rounded_hashtable_size(params->nelem_hint); + size = rounded_hashtable_size(params); tbl = bucket_table_alloc(size, GFP_KERNEL); if (tbl == NULL) -- cgit v1.2.3-59-g8ed1b From 87fed556d08d21dd7dd3e0222c94c187e4c2d5e2 Mon Sep 17 00:00:00 2001 From: Rafał Miłecki Date: Wed, 3 Sep 2014 10:35:13 +0200 Subject: bcma: get info about flash type SoC booted from There is ongoing work on cleaning up MIPS nvram support so that it can be reused on other platforms (bcm53xx, to be precise). This will require a bit of extra logic in bcma, which this patch implements. Signed-off-by: Rafał Miłecki Signed-off-by: John W.
Linville --- drivers/bcma/driver_mips.c | 62 ++++++++++++++++++++++++++++++++++++++++++ include/linux/bcma/bcma_regs.h | 5 ++++ 2 files changed, 67 insertions(+) (limited to 'include/linux') diff --git a/drivers/bcma/driver_mips.c b/drivers/bcma/driver_mips.c index 11115bbe115c..004d6aa671ce 100644 --- a/drivers/bcma/driver_mips.c +++ b/drivers/bcma/driver_mips.c @@ -21,6 +21,14 @@ #include #include +enum bcma_boot_dev { + BCMA_BOOT_DEV_UNK = 0, + BCMA_BOOT_DEV_ROM, + BCMA_BOOT_DEV_PARALLEL, + BCMA_BOOT_DEV_SERIAL, + BCMA_BOOT_DEV_NAND, +}; + static const char * const part_probes[] = { "bcm47xxpart", NULL }; static struct physmap_flash_data bcma_pflash_data = { @@ -229,11 +237,51 @@ u32 bcma_cpu_clock(struct bcma_drv_mips *mcore) } EXPORT_SYMBOL(bcma_cpu_clock); +static enum bcma_boot_dev bcma_boot_dev(struct bcma_bus *bus) +{ + struct bcma_drv_cc *cc = &bus->drv_cc; + u8 cc_rev = cc->core->id.rev; + + if (cc_rev == 42) { + struct bcma_device *core; + + core = bcma_find_core(bus, BCMA_CORE_NS_ROM); + if (core) { + switch (bcma_aread32(core, BCMA_IOST) & + BCMA_NS_ROM_IOST_BOOT_DEV_MASK) { + case BCMA_NS_ROM_IOST_BOOT_DEV_NOR: + return BCMA_BOOT_DEV_SERIAL; + case BCMA_NS_ROM_IOST_BOOT_DEV_NAND: + return BCMA_BOOT_DEV_NAND; + case BCMA_NS_ROM_IOST_BOOT_DEV_ROM: + default: + return BCMA_BOOT_DEV_ROM; + } + } + } else { + if (cc_rev == 38) { + if (cc->status & BCMA_CC_CHIPST_5357_NAND_BOOT) + return BCMA_BOOT_DEV_NAND; + else if (cc->status & BIT(5)) + return BCMA_BOOT_DEV_ROM; + } + + if ((cc->capabilities & BCMA_CC_CAP_FLASHT) == + BCMA_CC_FLASHT_PARA) + return BCMA_BOOT_DEV_PARALLEL; + else + return BCMA_BOOT_DEV_SERIAL; + } + + return BCMA_BOOT_DEV_SERIAL; +} + static void bcma_core_mips_flash_detect(struct bcma_drv_mips *mcore) { struct bcma_bus *bus = mcore->core->bus; struct bcma_drv_cc *cc = &bus->drv_cc; struct bcma_pflash *pflash = &cc->pflash; + enum bcma_boot_dev boot_dev; switch (cc->capabilities & BCMA_CC_CAP_FLASHT) { case BCMA_CC_FLASHT_STSER: @@ -269,6 +317,20 @@ static void bcma_core_mips_flash_detect(struct bcma_drv_mips *mcore) bcma_nflash_init(cc); } } + + /* Determine flash type this SoC boots from */ + boot_dev = bcma_boot_dev(bus); + switch (boot_dev) { + case BCMA_BOOT_DEV_PARALLEL: + case BCMA_BOOT_DEV_SERIAL: + /* TODO: Init NVRAM using BCMA_SOC_FLASH2 window */ + break; + case BCMA_BOOT_DEV_NAND: + /* TODO: Init NVRAM using BCMA_SOC_FLASH1 window */ + break; + default: + break; + } } void bcma_core_mips_early_init(struct bcma_drv_mips *mcore) diff --git a/include/linux/bcma/bcma_regs.h b/include/linux/bcma/bcma_regs.h index 917dcd7965e7..e64ae7bf80a1 100644 --- a/include/linux/bcma/bcma_regs.h +++ b/include/linux/bcma/bcma_regs.h @@ -39,6 +39,11 @@ #define BCMA_RESET_CTL_RESET 0x0001 #define BCMA_RESET_ST 0x0804 +#define BCMA_NS_ROM_IOST_BOOT_DEV_MASK 0x0003 +#define BCMA_NS_ROM_IOST_BOOT_DEV_NOR 0x0000 +#define BCMA_NS_ROM_IOST_BOOT_DEV_NAND 0x0001 +#define BCMA_NS_ROM_IOST_BOOT_DEV_ROM 0x0002 + /* BCMA PCI config space registers. */ #define BCMA_PCI_PMCSR 0x44 #define BCMA_PCI_PE 0x100 -- cgit v1.2.3-59-g8ed1b From a9fe8e29945d56f35235a3a0fba99b4cf181d211 Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Tue, 2 Sep 2014 15:49:26 +0200 Subject: ipv4: implement igmp_qrv sysctl to tune igmp robustness variable As in IPv6 people might increase the igmp query robustness variable to make sure unsolicited state change reports aren't lost on the network. Add and document this new knob to igmp code. 
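One idiom worth noting for the diff that follows: the patch repeatedly collapses open-coded fallbacks into the GCC ?: extension, where a ?: b evaluates to a when a is non-zero and to b otherwise, as sketched here:

/* These two statements are equivalent; the patch uses the first
 * form: fall back to the sysctl default whenever no robustness value
 * was learned from a querier on this interface.
 */
pmc->crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv;
pmc->crcount = in_dev->mr_qrv ? in_dev->mr_qrv : sysctl_igmp_qrv;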
Tuning this parameter has been allowed ever since the first IGMP RFC, so we also use this setting for all counters, including source-specific multicast. Also take over the sysctl value when the interface is brought up, instead of reusing the last value seen on the interface. Cc: Flavio Leitner Signed-off-by: Hannes Frederic Sowa Acked-by: Flavio Leitner Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 5 +++++ include/linux/igmp.h | 1 + net/ipv4/igmp.c | 31 +++++++++++++++---------------- net/ipv4/sysctl_net_ipv4.c | 10 ++++++++++ 4 files changed, 31 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index cfc71ac0f764..db2383cb1df9 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -844,6 +844,11 @@ igmp_max_memberships - INTEGER conf/all/* is special, changes the settings for all interfaces +igmp_qrv - INTEGER + Controls the IGMP query robustness variable (see RFC2236 8.1). + Default: 2 (as specified by RFC2236 8.1) + Minimum: 1 (as specified by RFC6636 4.5) + log_martians - BOOLEAN Log packets with impossible addresses to kernel log. log_martians for the interface will be enabled if at least one of diff --git a/include/linux/igmp.h b/include/linux/igmp.h index f47550d75f85..2c677afeea47 100644 --- a/include/linux/igmp.h +++ b/include/linux/igmp.h @@ -39,6 +39,7 @@ static inline struct igmpv3_query * extern int sysctl_igmp_max_memberships; extern int sysctl_igmp_max_msf; +extern int sysctl_igmp_qrv; struct ip_sf_socklist { unsigned int sl_max; diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 890c4258804c..4146153d875d 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -117,7 +117,7 @@ #define IGMP_V2_Unsolicited_Report_Interval (10*HZ) #define IGMP_V3_Unsolicited_Report_Interval (1*HZ) #define IGMP_Query_Response_Interval (10*HZ) -#define IGMP_Unsolicited_Report_Count 2 +#define IGMP_Query_Robustness_Variable 2 #define IGMP_Initial_Report_Delay (1) @@ -756,8 +756,7 @@ static void igmp_ifc_event(struct in_device *in_dev) { if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) return; - in_dev->mr_ifc_count = in_dev->mr_qrv ? in_dev->mr_qrv : - IGMP_Unsolicited_Report_Count; + in_dev->mr_ifc_count = in_dev->mr_qrv ?: sysctl_igmp_qrv; igmp_ifc_start_timer(in_dev, 1); } @@ -1086,8 +1085,7 @@ static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im) pmc->interface = im->interface; in_dev_hold(in_dev); pmc->multiaddr = im->multiaddr; - pmc->crcount = in_dev->mr_qrv ? in_dev->mr_qrv : - IGMP_Unsolicited_Report_Count; + pmc->crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv; pmc->sfmode = im->sfmode; if (pmc->sfmode == MCAST_INCLUDE) { struct ip_sf_list *psf; @@ -1226,8 +1224,7 @@ static void igmp_group_added(struct ip_mc_list *im) } /* else, v3 */ - im->crcount = in_dev->mr_qrv ?
in_dev->mr_qrv : - IGMP_Unsolicited_Report_Count; + im->crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv; igmp_ifc_event(in_dev); #endif } @@ -1322,7 +1319,7 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr) spin_lock_init(&im->lock); #ifdef CONFIG_IP_MULTICAST setup_timer(&im->timer, igmp_timer_expire, (unsigned long)im); - im->unsolicit_count = IGMP_Unsolicited_Report_Count; + im->unsolicit_count = sysctl_igmp_qrv; #endif im->next_rcu = in_dev->mc_list; @@ -1460,7 +1457,7 @@ void ip_mc_init_dev(struct in_device *in_dev) (unsigned long)in_dev); setup_timer(&in_dev->mr_ifc_timer, igmp_ifc_timer_expire, (unsigned long)in_dev); - in_dev->mr_qrv = IGMP_Unsolicited_Report_Count; + in_dev->mr_qrv = sysctl_igmp_qrv; #endif spin_lock_init(&in_dev->mc_tomb_lock); @@ -1474,6 +1471,9 @@ void ip_mc_up(struct in_device *in_dev) ASSERT_RTNL(); +#ifdef CONFIG_IP_MULTICAST + in_dev->mr_qrv = sysctl_igmp_qrv; +#endif ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS); for_each_pmc_rtnl(in_dev, pmc) @@ -1540,7 +1540,9 @@ static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr) */ int sysctl_igmp_max_memberships __read_mostly = IP_MAX_MEMBERSHIPS; int sysctl_igmp_max_msf __read_mostly = IP_MAX_MSF; - +#ifdef CONFIG_IP_MULTICAST +int sysctl_igmp_qrv __read_mostly = IGMP_Query_Robustness_Variable; +#endif static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode, __be32 *psfsrc) @@ -1575,8 +1577,7 @@ static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode, #ifdef CONFIG_IP_MULTICAST if (psf->sf_oldin && !IGMP_V1_SEEN(in_dev) && !IGMP_V2_SEEN(in_dev)) { - psf->sf_crcount = in_dev->mr_qrv ? in_dev->mr_qrv : - IGMP_Unsolicited_Report_Count; + psf->sf_crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv; psf->sf_next = pmc->tomb; pmc->tomb = psf; rv = 1; @@ -1639,8 +1640,7 @@ static int ip_mc_del_src(struct in_device *in_dev, __be32 *pmca, int sfmode, /* filter mode change */ pmc->sfmode = MCAST_INCLUDE; #ifdef CONFIG_IP_MULTICAST - pmc->crcount = in_dev->mr_qrv ? in_dev->mr_qrv : - IGMP_Unsolicited_Report_Count; + pmc->crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv; in_dev->mr_ifc_count = pmc->crcount; for (psf = pmc->sources; psf; psf = psf->sf_next) psf->sf_crcount = 0; @@ -1818,8 +1818,7 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode, #ifdef CONFIG_IP_MULTICAST /* else no filters; keep old mode for reports */ - pmc->crcount = in_dev->mr_qrv ? in_dev->mr_qrv : - IGMP_Unsolicited_Report_Count; + pmc->crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv; in_dev->mr_ifc_count = pmc->crcount; for (psf = pmc->sources; psf; psf = psf->sf_next) psf->sf_crcount = 0; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 79a007c52558..45d156dacd61 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -450,6 +450,16 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, +#ifdef CONFIG_IP_MULTICAST + { + .procname = "igmp_qrv", + .data = &sysctl_igmp_qrv, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one + }, +#endif { .procname = "inet_peer_threshold", .data = &inet_peer_threshold, -- cgit v1.2.3-59-g8ed1b From 2740f0cf8ec8bc7ee6a58f68841759e367dda98f Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 3 Sep 2014 15:24:58 +0300 Subject: cfg80211: add Intel Mobile Communications copyright Our legal structure changed at some point (see wikipedia), but we forgot to immediately switch over to the new copyright notice. 
For files that we have modified in the time since the change, add the proper copyright notice now. Signed-off-by: Johannes Berg Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 1 + include/net/cfg80211.h | 1 + net/wireless/chan.c | 1 + net/wireless/core.c | 1 + net/wireless/nl80211.c | 1 + net/wireless/reg.c | 1 + net/wireless/scan.c | 1 + net/wireless/util.c | 1 + 8 files changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 8018c915ee63..77d4f26e0235 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -6,6 +6,7 @@ * Copyright (c) 2002-2003, Jouni Malinen * Copyright (c) 2005, Devicescape Software, Inc. * Copyright (c) 2006, Michael Wu + * Copyright (c) 2013 - 2014 Intel Mobile Communications GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index ab21299c8f4d..6be68bb31918 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -4,6 +4,7 @@ * 802.11 device and configuration interface * * Copyright 2006-2010 Johannes Berg + * Copyright 2013-2014 Intel Mobile Communications GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as diff --git a/net/wireless/chan.c b/net/wireless/chan.c index 992b34070bcb..72d81e2154d5 100644 --- a/net/wireless/chan.c +++ b/net/wireless/chan.c @@ -4,6 +4,7 @@ * any point in time. * * Copyright 2009 Johannes Berg + * Copyright 2013-2014 Intel Mobile Communications GmbH */ #include diff --git a/net/wireless/core.c b/net/wireless/core.c index a207eb116829..9698fe709251 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -2,6 +2,7 @@ * This is the linux wireless configuration interface. * * Copyright 2006-2010 Johannes Berg + * Copyright 2013-2014 Intel Mobile Communications GmbH */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 3011401f52c0..d876146498dd 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -2,6 +2,7 @@ * This is the new netlink-based wireless configuration interface. * * Copyright 2006-2010 Johannes Berg + * Copyright 2013-2014 Intel Mobile Communications GmbH */ #include diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 1afdf45db38f..8bfc9b172a49 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -3,6 +3,7 @@ * Copyright 2005-2006, Devicescape Software, Inc. * Copyright 2007 Johannes Berg * Copyright 2008-2011 Luis R. 
Rodriguez + * Copyright 2013-2014 Intel Mobile Communications GmbH * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 620a4b40d466..bda39f149810 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -2,6 +2,7 @@ * cfg80211 scan result handling * * Copyright 2008 Johannes Berg + * Copyright 2013-2014 Intel Mobile Communications GmbH */ #include #include diff --git a/net/wireless/util.c b/net/wireless/util.c index 728f1c0dc70d..a8b2816ffcb9 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -2,6 +2,7 @@ * Wireless utility functions * * Copyright 2007-2009 Johannes Berg + * Copyright 2013-2014 Intel Mobile Communications GmbH */ #include #include -- cgit v1.2.3-59-g8ed1b From 60a3b2253c413cf601783b070507d7dd6620c954 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 2 Sep 2014 22:53:44 +0200 Subject: net: bpf: make eBPF interpreter images read-only With eBPF getting more extended and its exposure to user space on the way, hardening the memory range the interpreter uses to steer its command flow seems appropriate. This patch moves the to-be-interpreted bytecode to read-only pages. In case we execute a corrupted BPF interpreter image for some reason, e.g. caused by an attacker who got past a verifier stage, it would not only provide arbitrary read/write memory access but arbitrary function calls as well. After setting up the BPF interpreter image, its contents do not change until destruction time, thus we can set up the image on pages made immutable in order to mitigate modifications to that code. The idea is derived from commit 314beb9bcabf ("x86: bpf_jit_comp: secure bpf jit against spraying attacks"). This is possible because bpf_prog is not part of sk_filter anymore. After setup, bpf_prog cannot be altered during its lifetime. This prevents any modifications to the entire bpf_prog structure (incl. function/JIT image pointer). Every eBPF program (including migrated classic BPF programs) has to call bpf_prog_select_runtime() to select either the interpreter or a JIT image as a last setup step, and they are all freed via bpf_prog_free(), including non-JIT ones. Therefore, we can easily integrate this into the eBPF lifetime, and since we directly allocate a bpf_prog, we incur no performance penalty. Tested with seccomp and test_bpf testsuite in JIT/non-JIT mode and manual inspection of kernel_page_tables. Brad Spengler proposed the same idea via Twitter during development of this patch. Joint work with Hannes Frederic Sowa. Suggested-by: Brad Spengler Signed-off-by: Daniel Borkmann Signed-off-by: Hannes Frederic Sowa Cc: Alexei Starovoitov Cc: Kees Cook Acked-by: Alexei Starovoitov Signed-off-by: David S.
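The resulting program life-cycle, condensed into a sketch (allocation sizes and error handling trimmed; all calls are ones this patch introduces or reworks in the hunks below):

/* Build phase: the image pages are writable while being filled. */
fp = bpf_prog_alloc(bpf_prog_size(len), 0);
if (fp == NULL)
	return -ENOMEM;
/* ... fill fp->insnsi[] (or fp->insns[]), set fp->len ... */

/* Last setup step: choose interpreter or JIT, then seal the image;
 * bpf_prog_select_runtime() ends in bpf_prog_lock_ro().
 */
bpf_prog_select_runtime(fp);

/* ... run the program; the image can no longer be modified ... */

/* Teardown: deferred work makes the pages writable again and frees
 * them (bpf_prog_unlock_ro() before __bpf_prog_free()).
 */
bpf_prog_free(fp);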
Miller --- arch/arm/net/bpf_jit_32.c | 3 +- arch/mips/net/bpf_jit.c | 3 +- arch/powerpc/net/bpf_jit_comp.c | 3 +- arch/s390/net/bpf_jit_comp.c | 2 +- arch/sparc/net/bpf_jit_comp.c | 3 +- arch/x86/net/bpf_jit_comp.c | 18 ++++------ include/linux/filter.h | 49 ++++++++++++++++++++++--- kernel/bpf/core.c | 80 +++++++++++++++++++++++++++++++++++++++-- kernel/seccomp.c | 7 ++-- lib/test_bpf.c | 2 +- net/core/filter.c | 6 ++-- 11 files changed, 144 insertions(+), 32 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index a37b989a2f91..a76623bcf722 100644 --- a/arch/arm/net/bpf_jit_32.c +++ b/arch/arm/net/bpf_jit_32.c @@ -930,5 +930,6 @@ void bpf_jit_free(struct bpf_prog *fp) { if (fp->jited) module_free(NULL, fp->bpf_func); - kfree(fp); + + bpf_prog_unlock_free(fp); } diff --git a/arch/mips/net/bpf_jit.c b/arch/mips/net/bpf_jit.c index 05a56619ece2..cfa83cf2447d 100644 --- a/arch/mips/net/bpf_jit.c +++ b/arch/mips/net/bpf_jit.c @@ -1427,5 +1427,6 @@ void bpf_jit_free(struct bpf_prog *fp) { if (fp->jited) module_free(NULL, fp->bpf_func); - kfree(fp); + + bpf_prog_unlock_free(fp); } diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c index 3afa6f4c1957..40c53ff59124 100644 --- a/arch/powerpc/net/bpf_jit_comp.c +++ b/arch/powerpc/net/bpf_jit_comp.c @@ -697,5 +697,6 @@ void bpf_jit_free(struct bpf_prog *fp) { if (fp->jited) module_free(NULL, fp->bpf_func); - kfree(fp); + + bpf_prog_unlock_free(fp); } diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 61e45b7c04d7..f2833c5b218a 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -887,5 +887,5 @@ void bpf_jit_free(struct bpf_prog *fp) module_free(NULL, header); free_filter: - kfree(fp); + bpf_prog_unlock_free(fp); } diff --git a/arch/sparc/net/bpf_jit_comp.c b/arch/sparc/net/bpf_jit_comp.c index 1f76c22a6a75..f7a736b645e8 100644 --- a/arch/sparc/net/bpf_jit_comp.c +++ b/arch/sparc/net/bpf_jit_comp.c @@ -812,5 +812,6 @@ void bpf_jit_free(struct bpf_prog *fp) { if (fp->jited) module_free(NULL, fp->bpf_func); - kfree(fp); + + bpf_prog_unlock_free(fp); } diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index b08a98c59530..39ccfbb4a723 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -972,23 +972,17 @@ out: kfree(addrs); } -static void bpf_jit_free_deferred(struct work_struct *work) +void bpf_jit_free(struct bpf_prog *fp) { - struct bpf_prog *fp = container_of(work, struct bpf_prog, work); unsigned long addr = (unsigned long)fp->bpf_func & PAGE_MASK; struct bpf_binary_header *header = (void *)addr; + if (!fp->jited) + goto free_filter; + set_memory_rw(addr, header->pages); module_free(NULL, header); - kfree(fp); -} -void bpf_jit_free(struct bpf_prog *fp) -{ - if (fp->jited) { - INIT_WORK(&fp->work, bpf_jit_free_deferred); - schedule_work(&fp->work); - } else { - kfree(fp); - } +free_filter: + bpf_prog_unlock_free(fp); } diff --git a/include/linux/filter.h b/include/linux/filter.h index a5227ab8ccb1..c78994593355 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -9,6 +9,11 @@ #include #include #include +#include + +struct sk_buff; +struct sock; +struct seccomp_data; /* Internally used and optimized filter representation with extended * instruction set based on top of classic BPF. 
@@ -320,20 +325,23 @@ struct sock_fprog_kern { struct sock_filter *filter; }; -struct sk_buff; -struct sock; -struct seccomp_data; +struct bpf_work_struct { + struct bpf_prog *prog; + struct work_struct work; +}; struct bpf_prog { + u32 pages; /* Number of allocated pages */ u32 jited:1, /* Is our filter JIT'ed? */ len:31; /* Number of filter blocks */ struct sock_fprog_kern *orig_prog; /* Original BPF program */ + struct bpf_work_struct *work; /* Deferred free work struct */ unsigned int (*bpf_func)(const struct sk_buff *skb, const struct bpf_insn *filter); + /* Instructions for interpreter */ union { struct sock_filter insns[0]; struct bpf_insn insnsi[0]; - struct work_struct work; }; }; @@ -353,6 +361,26 @@ static inline unsigned int bpf_prog_size(unsigned int proglen) #define bpf_classic_proglen(fprog) (fprog->len * sizeof(fprog->filter[0])) +#ifdef CONFIG_DEBUG_SET_MODULE_RONX +static inline void bpf_prog_lock_ro(struct bpf_prog *fp) +{ + set_memory_ro((unsigned long)fp, fp->pages); +} + +static inline void bpf_prog_unlock_ro(struct bpf_prog *fp) +{ + set_memory_rw((unsigned long)fp, fp->pages); +} +#else +static inline void bpf_prog_lock_ro(struct bpf_prog *fp) +{ +} + +static inline void bpf_prog_unlock_ro(struct bpf_prog *fp) +{ +} +#endif /* CONFIG_DEBUG_SET_MODULE_RONX */ + int sk_filter(struct sock *sk, struct sk_buff *skb); void bpf_prog_select_runtime(struct bpf_prog *fp); @@ -361,6 +389,17 @@ void bpf_prog_free(struct bpf_prog *fp); int bpf_convert_filter(struct sock_filter *prog, int len, struct bpf_insn *new_prog, int *new_len); +struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags); +struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, + gfp_t gfp_extra_flags); +void __bpf_prog_free(struct bpf_prog *fp); + +static inline void bpf_prog_unlock_free(struct bpf_prog *fp) +{ + bpf_prog_unlock_ro(fp); + __bpf_prog_free(fp); +} + int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog); void bpf_prog_destroy(struct bpf_prog *fp); @@ -450,7 +489,7 @@ static inline void bpf_jit_compile(struct bpf_prog *fp) static inline void bpf_jit_free(struct bpf_prog *fp) { - kfree(fp); + bpf_prog_unlock_free(fp); } #endif /* CONFIG_BPF_JIT */ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 7f0dbcbb34af..b54bb2c2e494 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -22,6 +22,7 @@ */ #include #include +#include #include /* Registers */ @@ -63,6 +64,67 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns return NULL; } +struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) +{ + gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO | + gfp_extra_flags; + struct bpf_work_struct *ws; + struct bpf_prog *fp; + + size = round_up(size, PAGE_SIZE); + fp = __vmalloc(size, gfp_flags, PAGE_KERNEL); + if (fp == NULL) + return NULL; + + ws = kmalloc(sizeof(*ws), GFP_KERNEL | gfp_extra_flags); + if (ws == NULL) { + vfree(fp); + return NULL; + } + + fp->pages = size / PAGE_SIZE; + fp->work = ws; + + return fp; +} +EXPORT_SYMBOL_GPL(bpf_prog_alloc); + +struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, + gfp_t gfp_extra_flags) +{ + gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO | + gfp_extra_flags; + struct bpf_prog *fp; + + BUG_ON(fp_old == NULL); + + size = round_up(size, PAGE_SIZE); + if (size <= fp_old->pages * PAGE_SIZE) + return fp_old; + + fp = __vmalloc(size, gfp_flags, PAGE_KERNEL); + if (fp != NULL) { + memcpy(fp, fp_old, 
fp_old->pages * PAGE_SIZE); + fp->pages = size / PAGE_SIZE; + + /* We keep fp->work from fp_old around in the new + * reallocated structure. + */ + fp_old->work = NULL; + __bpf_prog_free(fp_old); + } + + return fp; +} +EXPORT_SYMBOL_GPL(bpf_prog_realloc); + +void __bpf_prog_free(struct bpf_prog *fp) +{ + kfree(fp->work); + vfree(fp); +} +EXPORT_SYMBOL_GPL(__bpf_prog_free); + /* Base function for offset calculation. Needs to go into .text section, * therefore keeping it non-static as well; will also be used by JITs * anyway later on, so do not let the compiler omit it. @@ -523,12 +585,26 @@ void bpf_prog_select_runtime(struct bpf_prog *fp) /* Probe if internal BPF can be JITed */ bpf_int_jit_compile(fp); + /* Lock whole bpf_prog as read-only */ + bpf_prog_lock_ro(fp); } EXPORT_SYMBOL_GPL(bpf_prog_select_runtime); -/* free internal BPF program */ +static void bpf_prog_free_deferred(struct work_struct *work) +{ + struct bpf_work_struct *ws; + + ws = container_of(work, struct bpf_work_struct, work); + bpf_jit_free(ws->prog); +} + +/* Free internal BPF program */ void bpf_prog_free(struct bpf_prog *fp) { - bpf_jit_free(fp); + struct bpf_work_struct *ws = fp->work; + + INIT_WORK(&ws->work, bpf_prog_free_deferred); + ws->prog = fp; + schedule_work(&ws->work); } EXPORT_SYMBOL_GPL(bpf_prog_free); diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 44eb005c6695..84922befea84 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -395,16 +395,15 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) if (!filter) goto free_prog; - filter->prog = kzalloc(bpf_prog_size(new_len), - GFP_KERNEL|__GFP_NOWARN); + filter->prog = bpf_prog_alloc(bpf_prog_size(new_len), __GFP_NOWARN); if (!filter->prog) goto free_filter; ret = bpf_convert_filter(fp, fprog->len, filter->prog->insnsi, &new_len); if (ret) goto free_filter_prog; - kfree(fp); + kfree(fp); atomic_set(&filter->usage, 1); filter->prog->len = new_len; @@ -413,7 +412,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) return filter; free_filter_prog: - kfree(filter->prog); + __bpf_prog_free(filter->prog); free_filter: kfree(filter); free_prog: diff --git a/lib/test_bpf.c b/lib/test_bpf.c index 8c66c6aace04..9a67456ba29a 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -1836,7 +1836,7 @@ static struct bpf_prog *generate_filter(int which, int *err) break; case INTERNAL: - fp = kzalloc(bpf_prog_size(flen), GFP_KERNEL); + fp = bpf_prog_alloc(bpf_prog_size(flen), 0); if (fp == NULL) { pr_cont("UNEXPECTED_FAIL no memory left\n"); *err = -ENOMEM; diff --git a/net/core/filter.c b/net/core/filter.c index d814b8a89d0f..37f8eb06fdee 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -933,7 +933,7 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) /* Expand fp for appending the new filter representation. */ old_fp = fp; - fp = krealloc(old_fp, bpf_prog_size(new_len), GFP_KERNEL); + fp = bpf_prog_realloc(old_fp, bpf_prog_size(new_len), 0); if (!fp) { /* The old_fp is still around in case we couldn't * allocate new memory, so uncharge on that one. 
@@ -1013,7 +1013,7 @@ int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog) if (fprog->filter == NULL) return -EINVAL; - fp = kmalloc(bpf_prog_size(fprog->len), GFP_KERNEL); + fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0); if (!fp) return -ENOMEM; @@ -1069,7 +1069,7 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) if (fprog->filter == NULL) return -EINVAL; - prog = kmalloc(bpf_fsize, GFP_KERNEL); + prog = bpf_prog_alloc(bpf_fsize, 0); if (!prog) return -ENOMEM; -- cgit v1.2.3-59-g8ed1b From f0db9b073415848709dd59a6394969882f517da9 Mon Sep 17 00:00:00 2001 From: Govindarajulu Varadarajan <_govind@gmx.com> Date: Wed, 3 Sep 2014 03:17:20 +0530 Subject: ethtool: Add generic options for tunables This patch adds new ethtool cmd, ETHTOOL_GTUNABLE & ETHTOOL_STUNABLE for getting tunable values from driver. Add get_tunable and set_tunable to ethtool_ops. Driver implements these functions for getting/setting tunable value. Signed-off-by: Govindarajulu Varadarajan <_govind@gmx.com> Signed-off-by: David S. Miller --- include/linux/ethtool.h | 4 +++ include/uapi/linux/ethtool.h | 28 +++++++++++++++ net/core/ethtool.c | 81 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 113 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index e658229fee39..c1a2d60dfb82 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -257,6 +257,10 @@ struct ethtool_ops { struct ethtool_eeprom *, u8 *); int (*get_eee)(struct net_device *, struct ethtool_eee *); int (*set_eee)(struct net_device *, struct ethtool_eee *); + int (*get_tunable)(struct net_device *, + const struct ethtool_tunable *, void *); + int (*set_tunable)(struct net_device *, + const struct ethtool_tunable *, const void *); }; diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index e3c7a719c76b..7a364f2f3d3f 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -209,6 +209,32 @@ struct ethtool_value { __u32 data; }; +enum tunable_id { + ETHTOOL_ID_UNSPEC, + ETHTOOL_RX_COPYBREAK, +}; + +enum tunable_type_id { + ETHTOOL_TUNABLE_UNSPEC, + ETHTOOL_TUNABLE_U8, + ETHTOOL_TUNABLE_U16, + ETHTOOL_TUNABLE_U32, + ETHTOOL_TUNABLE_U64, + ETHTOOL_TUNABLE_STRING, + ETHTOOL_TUNABLE_S8, + ETHTOOL_TUNABLE_S16, + ETHTOOL_TUNABLE_S32, + ETHTOOL_TUNABLE_S64, +}; + +struct ethtool_tunable { + __u32 cmd; + __u32 id; + __u32 type_id; + __u32 len; + void *data[0]; +}; + /** * struct ethtool_regs - hardware register dump * @cmd: Command number = %ETHTOOL_GREGS @@ -1152,6 +1178,8 @@ enum ethtool_sfeatures_retval_bits { #define ETHTOOL_GRSSH 0x00000046 /* Get RX flow hash configuration */ #define ETHTOOL_SRSSH 0x00000047 /* Set RX flow hash configuration */ +#define ETHTOOL_GTUNABLE 0x00000048 /* Get tunable configuration */ +#define ETHTOOL_STUNABLE 0x00000049 /* Set tunable configuration */ /* compatibility with older code */ #define SPARC_ETH_GSET ETHTOOL_GSET diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 17cb912793fa..27e61b886520 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -1621,6 +1621,80 @@ static int ethtool_get_module_eeprom(struct net_device *dev, modinfo.eeprom_len); } +static int ethtool_tunable_valid(const struct ethtool_tunable *tuna) +{ + switch (tuna->id) { + case ETHTOOL_RX_COPYBREAK: + if (tuna->len != sizeof(u32) || + tuna->type_id != ETHTOOL_TUNABLE_U32) + return -EINVAL; + break; + default: + return -EINVAL; + } + + return 0; +} + +static int 
ethtool_get_tunable(struct net_device *dev, void __user *useraddr) +{ + int ret; + struct ethtool_tunable tuna; + const struct ethtool_ops *ops = dev->ethtool_ops; + void *data; + + if (!ops->get_tunable) + return -EOPNOTSUPP; + if (copy_from_user(&tuna, useraddr, sizeof(tuna))) + return -EFAULT; + ret = ethtool_tunable_valid(&tuna); + if (ret) + return ret; + data = kmalloc(tuna.len, GFP_USER); + if (!data) + return -ENOMEM; + ret = ops->get_tunable(dev, &tuna, data); + if (ret) + goto out; + useraddr += sizeof(tuna); + ret = -EFAULT; + if (copy_to_user(useraddr, data, tuna.len)) + goto out; + ret = 0; + +out: + kfree(data); + return ret; +} + +static int ethtool_set_tunable(struct net_device *dev, void __user *useraddr) +{ + int ret; + struct ethtool_tunable tuna; + const struct ethtool_ops *ops = dev->ethtool_ops; + void *data; + + if (!ops->set_tunable) + return -EOPNOTSUPP; + if (copy_from_user(&tuna, useraddr, sizeof(tuna))) + return -EFAULT; + ret = ethtool_tunable_valid(&tuna); + if (ret) + return ret; + data = kmalloc(tuna.len, GFP_USER); + if (!data) + return -ENOMEM; + useraddr += sizeof(tuna); + ret = -EFAULT; + if (copy_from_user(data, useraddr, tuna.len)) + goto out; + ret = ops->set_tunable(dev, &tuna, data); + +out: + kfree(data); + return ret; +} + /* The main entry point in this file. Called from net/core/dev_ioctl.c */ int dev_ethtool(struct net *net, struct ifreq *ifr) @@ -1670,6 +1744,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_GCHANNELS: case ETHTOOL_GET_TS_INFO: case ETHTOOL_GEEE: + case ETHTOOL_GTUNABLE: break; default: if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) @@ -1857,6 +1932,12 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_GMODULEEEPROM: rc = ethtool_get_module_eeprom(dev, useraddr); break; + case ETHTOOL_GTUNABLE: + rc = ethtool_get_tunable(dev, useraddr); + break; + case ETHTOOL_STUNABLE: + rc = ethtool_set_tunable(dev, useraddr); + break; default: rc = -EOPNOTSUPP; } -- cgit v1.2.3-59-g8ed1b From 62bccb8cdb69051b95a55ab0c489e3cab261c8ef Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Thu, 4 Sep 2014 13:31:35 -0400 Subject: net-timestamp: Make the clone operation stand-alone from phy timestamping The phy timestamping takes a different path than the regular timestamping does in that it will create a clone first so that the packets needing to be timestamped can be placed in a queue, or the context block could be used. In order to support these use cases I am pulling the core of the code out so it can be used in other drivers beyond just phy devices. In addition I have added a destructor named sock_efree which is meant to provide a simple way for dropping the reference to skb exceptions that aren't part of either the receive or send windows for the socket, and I have removed some duplication in spots where this destructor could be used in place of sock_edemux. Signed-off-by: Alexander Duyck Signed-off-by: David S. 
Miller --- drivers/net/phy/dp83640.c | 6 +++--- include/linux/skbuff.h | 2 ++ include/net/sock.h | 1 + net/core/skbuff.c | 32 +++++++++++++++++++++++++------- net/core/sock.c | 6 ++++++ net/core/timestamping.c | 14 +++----------- 6 files changed, 40 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/phy/dp83640.c b/drivers/net/phy/dp83640.c index d5991ac46ab0..87648b306551 100644 --- a/drivers/net/phy/dp83640.c +++ b/drivers/net/phy/dp83640.c @@ -1148,7 +1148,7 @@ static void dp83640_remove(struct phy_device *phydev) kfree_skb(skb); while ((skb = skb_dequeue(&dp83640->tx_queue)) != NULL) - skb_complete_tx_timestamp(skb, NULL); + kfree_skb(skb); clock = dp83640_clock_get(dp83640->clock); @@ -1405,7 +1405,7 @@ static void dp83640_txtstamp(struct phy_device *phydev, case HWTSTAMP_TX_ONESTEP_SYNC: if (is_sync(skb, type)) { - skb_complete_tx_timestamp(skb, NULL); + kfree_skb(skb); return; } /* fall through */ @@ -1416,7 +1416,7 @@ static void dp83640_txtstamp(struct phy_device *phydev, case HWTSTAMP_TX_OFF: default: - skb_complete_tx_timestamp(skb, NULL); + kfree_skb(skb); break; } } diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 02529fcad1ac..1cf0cfaef10a 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2690,6 +2690,8 @@ static inline ktime_t net_invalid_timestamp(void) return ktime_set(0, 0); } +struct sk_buff *skb_clone_sk(struct sk_buff *skb); + #ifdef CONFIG_NETWORK_PHY_TIMESTAMPING void skb_clone_tx_timestamp(struct sk_buff *skb); diff --git a/include/net/sock.h b/include/net/sock.h index 3fde6130789d..e02be37a3d91 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1574,6 +1574,7 @@ struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, void sock_wfree(struct sk_buff *skb); void skb_orphan_partial(struct sk_buff *skb); void sock_rfree(struct sk_buff *skb); +void sock_efree(struct sk_buff *skb); void sock_edemux(struct sk_buff *skb); int sock_setsockopt(struct socket *sock, int level, int op, diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 697e696c914b..a936a401564e 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3511,6 +3511,27 @@ struct sk_buff *sock_dequeue_err_skb(struct sock *sk) } EXPORT_SYMBOL(sock_dequeue_err_skb); +struct sk_buff *skb_clone_sk(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + struct sk_buff *clone; + + if (!sk || !atomic_inc_not_zero(&sk->sk_refcnt)) + return NULL; + + clone = skb_clone(skb, GFP_ATOMIC); + if (!clone) { + sock_put(sk); + return NULL; + } + + clone->sk = sk; + clone->destructor = sock_efree; + + return clone; +} +EXPORT_SYMBOL(skb_clone_sk); + static void __skb_complete_tx_timestamp(struct sk_buff *skb, struct sock *sk, int tstype) @@ -3540,14 +3561,11 @@ void skb_complete_tx_timestamp(struct sk_buff *skb, { struct sock *sk = skb->sk; - skb->sk = NULL; + /* take a reference to prevent skb_orphan() from freeing the socket */ + sock_hold(sk); - if (hwtstamps) { - *skb_hwtstamps(skb) = *hwtstamps; - __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND); - } else { - kfree_skb(skb); - } + *skb_hwtstamps(skb) = *hwtstamps; + __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND); sock_put(sk); } diff --git a/net/core/sock.c b/net/core/sock.c index f1a638ee93d9..d04005c51724 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1637,6 +1637,12 @@ void sock_rfree(struct sk_buff *skb) } EXPORT_SYMBOL(sock_rfree); +void sock_efree(struct sk_buff *skb) +{ + sock_put(skb->sk); +} +EXPORT_SYMBOL(sock_efree); + void sock_edemux(struct 
sk_buff *skb) { struct sock *sk = skb->sk; diff --git a/net/core/timestamping.c b/net/core/timestamping.c index f48a59fd8f39..43d3dd62fcc8 100644 --- a/net/core/timestamping.c +++ b/net/core/timestamping.c @@ -36,10 +36,9 @@ void skb_clone_tx_timestamp(struct sk_buff *skb) { struct phy_device *phydev; struct sk_buff *clone; - struct sock *sk = skb->sk; unsigned int type; - if (!sk) + if (!skb->sk) return; type = classify(skb); @@ -48,16 +47,9 @@ void skb_clone_tx_timestamp(struct sk_buff *skb) phydev = skb->dev->phydev; if (likely(phydev->drv->txtstamp)) { - if (!atomic_inc_not_zero(&sk->sk_refcnt)) + clone = skb_clone_sk(skb); + if (!clone) return; - - clone = skb_clone(skb, GFP_ATOMIC); - if (!clone) { - sock_put(sk); - return; - } - - clone->sk = sk; phydev->drv->txtstamp(phydev, clone, type); } } -- cgit v1.2.3-59-g8ed1b From 56193d1bce2b2759cb4bdcc00cd05544894a0c90 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 5 Sep 2014 19:20:26 -0400 Subject: net: Add function for parsing the header length out of linear ethernet frames This patch updates some of the flow_dissector api so that it can be used to parse the length of ethernet buffers stored in fragments. Most of the changes needed were to __skb_get_poff as it needed to be updated to support sending a linear buffer instead of a skb. I have split __skb_get_poff into two functions, the first is skb_get_poff and it retains the functionality of the original __skb_get_poff. The other function is __skb_get_poff which now works much like __skb_flow_dissect in relation to skb_flow_dissect in that it provides the same functionality but works with just a data buffer and hlen instead of needing an skb. Signed-off-by: Alexander Duyck Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/etherdevice.h | 1 + include/linux/skbuff.h | 4 +++- include/net/flow_keys.h | 2 ++ net/core/filter.c | 2 +- net/core/flow_dissector.c | 46 +++++++++++++++++++++++++++++++-------------- net/ethernet/eth.c | 27 ++++++++++++++++++++++++++ 6 files changed, 66 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index 9c5529dc6d07..733980fce8e3 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -29,6 +29,7 @@ #include #ifdef __KERNEL__ +u32 eth_get_headlen(void *data, unsigned int max_len); __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev); extern const struct header_ops eth_header_ops; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 1cf0cfaef10a..07c9fdd0c126 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3218,7 +3218,9 @@ bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off); int skb_checksum_setup(struct sk_buff *skb, bool recalculate); -u32 __skb_get_poff(const struct sk_buff *skb); +u32 skb_get_poff(const struct sk_buff *skb); +u32 __skb_get_poff(const struct sk_buff *skb, void *data, + const struct flow_keys *keys, int hlen); /** * skb_head_is_locked - Determine if the skb->head is locked down diff --git a/include/net/flow_keys.h b/include/net/flow_keys.h index 9a03f73c4974..7ee2df083542 100644 --- a/include/net/flow_keys.h +++ b/include/net/flow_keys.h @@ -40,4 +40,6 @@ static inline __be32 skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 return __skb_flow_get_ports(skb, thoff, ip_proto, NULL, 0); } u32 flow_hash_from_keys(struct flow_keys *keys); +unsigned int flow_get_hlen(const unsigned char *data, unsigned int max_len, + __be16 protocol); 
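/* Aside, not part of the patch: a sketch of the intended caller pattern
 * for eth_get_headlen() (declared in etherdevice.h above). A driver that
 * receives frames into page fragments can use it to size the pull of
 * packet headers into the skb's linear area, leaving the payload paged;
 * "va" and "size" are hypothetical names for the fragment mapping.
 */
static void example_pull_headers(struct sk_buff *skb, void *va,
				 unsigned int size)
{
	/* parse L2..L4 headers in the linear buffer, capped at size */
	unsigned int pull_len = eth_get_headlen(va, size);

	/* copy only the parsed headers; the payload stays in the frag */
	memcpy(__skb_put(skb, pull_len), va, pull_len);
}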
#endif diff --git a/net/core/filter.c b/net/core/filter.c index 37f8eb06fdee..fa5b7d0f77ac 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -113,7 +113,7 @@ static unsigned int pkt_type_offset(void) static u64 __skb_get_pay_offset(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) { - return __skb_get_poff((struct sk_buff *)(unsigned long) ctx); + return skb_get_poff((struct sk_buff *)(unsigned long) ctx); } static u64 __skb_get_nlattr(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 12f48ca7a0b0..8560dea58803 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -13,6 +13,7 @@ #include #include #include +#include /* copy saddr & daddr, possibly using 64bit load/store * Equivalent to : flow->src = iph->saddr; @@ -117,6 +118,13 @@ ipv6: flow->dst = (__force __be32)ipv6_addr_hash(&iph->daddr); nhoff += sizeof(struct ipv6hdr); + /* skip the flow label processing if skb is NULL. The + * assumption here is that if there is no skb we are not + * looking for flow info as much as we are length. + */ + if (!skb) + break; + flow_label = ip6_flowlabel(iph); if (flow_label) { /* Awesome, IPv6 packet has a flow label so we can @@ -165,6 +173,9 @@ ipv6: return false; } } + case htons(ETH_P_FCOE): + flow->thoff = (u16)(nhoff + FCOE_HEADER_LEN); + /* fall through */ default: return false; } @@ -316,26 +327,18 @@ u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb, } EXPORT_SYMBOL(__skb_tx_hash); -/* __skb_get_poff() returns the offset to the payload as far as it could - * be dissected. The main user is currently BPF, so that we can dynamically - * truncate packets without needing to push actual payload to the user - * space and can analyze headers only, instead. - */ -u32 __skb_get_poff(const struct sk_buff *skb) +u32 __skb_get_poff(const struct sk_buff *skb, void *data, + const struct flow_keys *keys, int hlen) { - struct flow_keys keys; - u32 poff = 0; + u32 poff = keys->thoff; - if (!skb_flow_dissect(skb, &keys)) - return 0; - - poff += keys.thoff; - switch (keys.ip_proto) { + switch (keys->ip_proto) { case IPPROTO_TCP: { const struct tcphdr *tcph; struct tcphdr _tcph; - tcph = skb_header_pointer(skb, poff, sizeof(_tcph), &_tcph); + tcph = __skb_header_pointer(skb, poff, sizeof(_tcph), + data, hlen, &_tcph); if (!tcph) return poff; @@ -369,6 +372,21 @@ u32 __skb_get_poff(const struct sk_buff *skb) return poff; } +/* skb_get_poff() returns the offset to the payload as far as it could + * be dissected. The main user is currently BPF, so that we can dynamically + * truncate packets without needing to push actual payload to the user + * space and can analyze headers only, instead. + */ +u32 skb_get_poff(const struct sk_buff *skb) +{ + struct flow_keys keys; + + if (!skb_flow_dissect(skb, &keys)) + return 0; + + return __skb_get_poff(skb, skb->data, &keys, skb_headlen(skb)); +} + static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) { #ifdef CONFIG_XPS diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 5cebca134585..33a140e15834 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -145,6 +145,33 @@ int eth_rebuild_header(struct sk_buff *skb) } EXPORT_SYMBOL(eth_rebuild_header); +/** + * eth_get_headlen - determine the the length of header for an ethernet frame + * @data: pointer to start of frame + * @len: total length of frame + * + * Make a best effort attempt to pull the length for all of the headers for + * a given frame in a linear buffer. 
+ */ +u32 eth_get_headlen(void *data, unsigned int len) +{ + const struct ethhdr *eth = (const struct ethhdr *)data; + struct flow_keys keys; + + /* this should never happen, but better safe than sorry */ + if (len < sizeof(*eth)) + return len; + + /* parse any remaining L2/L3 headers, check for L4 */ + if (!__skb_flow_dissect(NULL, &keys, data, + eth->h_proto, sizeof(*eth), len)) + return max_t(u32, keys.thoff, sizeof(*eth)); + + /* parse for any L4 headers */ + return min_t(u32, __skb_get_poff(NULL, data, &keys, len), len); +} +EXPORT_SYMBOL(eth_get_headlen); + /** * eth_type_trans - determine the packet's protocol ID. * @skb: received socket data -- cgit v1.2.3-59-g8ed1b From c8d6591752e96c550cb98b781326d72d8eedcc79 Mon Sep 17 00:00:00 2001 From: "Steinar H. Gunderson" Date: Wed, 3 Sep 2014 06:48:37 -0700 Subject: mac80211: support DTPC IE (from Cisco Client eXtensions) Linux already supports 802.11h, where the access point can tell the client to reduce its transmission power. However, 802.11h is only defined for 5 GHz, where the need for this is much smaller than on 2.4 GHz. Cisco has their own solution, called DTPC (Dynamic Transmit Power Control). Cisco APs on a controller sometimes but not always send 802.11h; they always send DTPC, even on 2.4 GHz. This patch adds support for parsing and honoring the DTPC IE in addition to the 802.11h element (they do not always contain the same limits, so both must be honored); the format is not documented, but very simple. Tested (on top of wireless.git and on 3.16.1) against a Cisco Aironet 1142 joined to a Cisco 2504 WLC, by setting various transmit power levels for the given access points and observing the results. The Wireshark 802.11 dissector agrees with the interpretation of the element, except for negative numbers, which seem to never happen anyway. Signed-off-by: Steinar H. 
Gunderson Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 3 ++- net/mac80211/ieee80211_i.h | 1 + net/mac80211/mlme.c | 60 +++++++++++++++++++++++++++++++++++++--------- net/mac80211/util.c | 25 +++++++++++++++++++ 4 files changed, 77 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 77d4f26e0235..61a09f0644aa 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1824,7 +1824,8 @@ enum ieee80211_eid { WLAN_EID_DMG_TSPEC = 146, WLAN_EID_DMG_AT = 147, WLAN_EID_DMG_CAP = 148, - /* 149-150 reserved for Cisco */ + /* 149 reserved for Cisco */ + WLAN_EID_CISCO_VENDOR_SPECIFIC = 150, WLAN_EID_DMG_OPERATION = 151, WLAN_EID_DMG_BSS_PARAM_CHANGE = 152, WLAN_EID_DMG_BEAM_REFINEMENT = 153, diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 6b7bdd8fae29..c2aaec4dfcf0 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1369,6 +1369,7 @@ struct ieee802_11_elems { const struct ieee80211_wide_bw_chansw_ie *wide_bw_chansw_ie; const u8 *country_elem; const u8 *pwr_constr_elem; + const u8 *cisco_dtpc_elem; const struct ieee80211_timeout_interval_ie *timeout_int; const u8 *opmode_notif; const struct ieee80211_sec_chan_offs_ie *sec_chan_offs; diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 0b812a88f95f..a7b92f5f7161 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -1230,14 +1230,30 @@ ieee80211_find_80211h_pwr_constr(struct ieee80211_sub_if_data *sdata, return have_chan_pwr; } +static void ieee80211_find_cisco_dtpc(struct ieee80211_sub_if_data *sdata, + struct ieee80211_channel *channel, + const u8 *cisco_dtpc_ie, + int *pwr_level) +{ + /* From practical testing, the first data byte of the DTPC element + * seems to contain the requested dBm level, and the CLI on Cisco + * APs clearly state the range is -127 to 127 dBm, which indicates + * a signed byte, although it seemingly never actually goes negative. + * The other byte seems to always be zero. + */ + *pwr_level = (__s8)cisco_dtpc_ie[4]; +} + static u32 ieee80211_handle_pwr_constr(struct ieee80211_sub_if_data *sdata, struct ieee80211_channel *channel, struct ieee80211_mgmt *mgmt, const u8 *country_ie, u8 country_ie_len, - const u8 *pwr_constr_ie) + const u8 *pwr_constr_ie, + const u8 *cisco_dtpc_ie) { - bool has_80211h_pwr = false; - int chan_pwr, pwr_reduction_80211h; + bool has_80211h_pwr = false, has_cisco_pwr = false; + int chan_pwr = 0, pwr_reduction_80211h = 0; + int pwr_level_cisco, pwr_level_80211h; int new_ap_level; if (country_ie && pwr_constr_ie && @@ -1246,16 +1262,35 @@ static u32 ieee80211_handle_pwr_constr(struct ieee80211_sub_if_data *sdata, has_80211h_pwr = ieee80211_find_80211h_pwr_constr( sdata, channel, country_ie, country_ie_len, pwr_constr_ie, &chan_pwr, &pwr_reduction_80211h); - new_ap_level = max_t(int, 0, chan_pwr - pwr_reduction_80211h); + pwr_level_80211h = + max_t(int, 0, chan_pwr - pwr_reduction_80211h); + } + + if (cisco_dtpc_ie) { + ieee80211_find_cisco_dtpc( + sdata, channel, cisco_dtpc_ie, &pwr_level_cisco); + has_cisco_pwr = true; } - if (!has_80211h_pwr) + if (!has_80211h_pwr && !has_cisco_pwr) return 0; - sdata_info(sdata, - "Limiting TX power to %d (%d - %d) dBm as advertised by %pM\n", - new_ap_level, chan_pwr, pwr_reduction_80211h, - sdata->u.mgd.bssid); + /* If we have both 802.11h and Cisco DTPC, apply both limits + * by picking the smallest of the two power levels advertised. 
+ */ + if (has_80211h_pwr && + (!has_cisco_pwr || pwr_level_80211h <= pwr_level_cisco)) { + sdata_info(sdata, + "Limiting TX power to %d (%d - %d) dBm as advertised by %pM\n", + pwr_level_80211h, chan_pwr, pwr_reduction_80211h, + sdata->u.mgd.bssid); + new_ap_level = pwr_level_80211h; + } else { /* has_cisco_pwr is always true here. */ + sdata_info(sdata, + "Limiting TX power to %d dBm as advertised by %pM\n", + pwr_level_cisco, sdata->u.mgd.bssid); + new_ap_level = pwr_level_cisco; + } if (sdata->ap_power_level == new_ap_level) return 0; @@ -2923,7 +2958,9 @@ static void ieee80211_rx_mgmt_probe_resp(struct ieee80211_sub_if_data *sdata, /* * This is the canonical list of information elements we care about, * the filter code also gives us all changes to the Microsoft OUI - * (00:50:F2) vendor IE which is used for WMM which we need to track. + * (00:50:F2) vendor IE which is used for WMM which we need to track, + * as well as the DTPC IE (part of the Cisco OUI) used for signaling + * changes to requested client power. * * We implement beacon filtering in software since that means we can * avoid processing the frame here and in cfg80211, and userspace @@ -3232,7 +3269,8 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, changed |= ieee80211_handle_pwr_constr(sdata, chan, mgmt, elems.country_elem, elems.country_elem_len, - elems.pwr_constr_elem); + elems.pwr_constr_elem, + elems.cisco_dtpc_elem); ieee80211_bss_info_change_notify(sdata, changed); } diff --git a/net/mac80211/util.c b/net/mac80211/util.c index b444f27a788e..3c61060a4d2b 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -1015,6 +1015,31 @@ u32 ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action, } elems->pwr_constr_elem = pos; break; + case WLAN_EID_CISCO_VENDOR_SPECIFIC: + /* Lots of different options exist, but we only care + * about the Dynamic Transmit Power Control element. + * First check for the Cisco OUI, then for the DTPC + * tag (0x00). + */ + if (elen < 4) { + elem_parse_failed = true; + break; + } + + if (pos[0] != 0x00 || pos[1] != 0x40 || + pos[2] != 0x96 || pos[3] != 0x00) + break; + + if (elen != 6) { + elem_parse_failed = true; + break; + } + + if (calc_crc) + crc = crc32_be(crc, pos - 2, elen + 2); + + elems->cisco_dtpc_elem = pos; + break; case WLAN_EID_TIMEOUT_INTERVAL: if (elen >= sizeof(struct ieee80211_timeout_interval_ie)) elems->timeout_int = (void *)pos; -- cgit v1.2.3-59-g8ed1b From 02ab695bb37ee9ad515df0d0790d5977505dd04a Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 4 Sep 2014 22:17:17 -0700 Subject: net: filter: add "load 64-bit immediate" eBPF instruction add BPF_LD_IMM64 instruction to load 64-bit immediate value into a register. All previous instructions were 8-byte. This is first 16-byte instruction. Two consecutive 'struct bpf_insn' blocks are interpreted as single instruction: insn[0].code = BPF_LD | BPF_DW | BPF_IMM insn[0].dst_reg = destination register insn[0].imm = lower 32-bit insn[1].code = 0 insn[1].imm = upper 32-bit All unused fields must be zero. Classic BPF has similar instruction: BPF_LD | BPF_W | BPF_IMM which loads 32-bit immediate value into a register. x64 JITs it as single 'movabsq %rax, imm64' arm64 may JIT as sequence of four 'movk x0, #imm16, lsl #shift' insn Note that old eBPF programs are binary compatible with new interpreter. 
It helps eBPF programs load 64-bit constant into a register with one instruction instead of using two registers and 4 instructions: BPF_MOV32_IMM(R1, imm32) BPF_ALU64_IMM(BPF_LSH, R1, 32) BPF_MOV32_IMM(R2, imm32) BPF_ALU64_REG(BPF_OR, R1, R2) User space generated programs will use this instruction to load constants only. To tell kernel that user space needs a pointer the _pseudo_ variant of this instruction may be added later, which will use extra bits of encoding to indicate what type of pointer user space is asking kernel to provide. For example 'off' or 'src_reg' fields can be used for such purpose. src_reg = 1 could mean that user space is asking kernel to validate and load in-kernel map pointer. src_reg = 2 could mean that user space needs readonly data section pointer src_reg = 3 could mean that user space needs a pointer to per-cpu local data All such future pseudo instructions will not be carrying the actual pointer as part of the instruction, but rather will be treated as a request to kernel to provide one. The kernel will verify the request_for_a_pointer, then will drop _pseudo_ marking and will store actual internal pointer inside the instruction, so the end result is the interpreter and JITs never see pseudo BPF_LD_IMM64 insns and only operate on generic BPF_LD_IMM64 that loads 64-bit immediate into a register. User space never operates on direct pointers and verifier can easily recognize request_for_pointer vs other instructions. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- Documentation/networking/filter.txt | 8 +++++++- arch/x86/net/bpf_jit_comp.c | 17 +++++++++++++++++ include/linux/filter.h | 18 ++++++++++++++++++ kernel/bpf/core.c | 5 +++++ lib/test_bpf.c | 21 +++++++++++++++++++++ 5 files changed, 68 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/Documentation/networking/filter.txt b/Documentation/networking/filter.txt index c48a9704bda8..81916ab5d96f 100644 --- a/Documentation/networking/filter.txt +++ b/Documentation/networking/filter.txt @@ -951,7 +951,7 @@ Size modifier is one of ... Mode modifier is one of: - BPF_IMM 0x00 /* classic BPF only, reserved in eBPF */ + BPF_IMM 0x00 /* used for 32-bit mov in classic BPF and 64-bit in eBPF */ BPF_ABS 0x20 BPF_IND 0x40 BPF_MEM 0x60 @@ -995,6 +995,12 @@ BPF_XADD | BPF_DW | BPF_STX: lock xadd *(u64 *)(dst_reg + off16) += src_reg Where size is one of: BPF_B or BPF_H or BPF_W or BPF_DW. Note that 1 and 2 byte atomic increments are not supported. +eBPF has one 16-byte instruction: BPF_LD | BPF_DW | BPF_IMM which consists +of two consecutive 'struct bpf_insn' 8-byte blocks and interpreted as single +instruction that loads 64-bit immediate value into a dst_reg. +Classic BPF has similar instruction: BPF_LD | BPF_W | BPF_IMM which loads +32-bit immediate value into a register. 
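As an illustrative sketch (not part of the patch text), the two blocks can be emitted with the BPF_LD_IMM64() helper this patch adds to filter.h, mirroring the "load 64-bit immediate" test case added to lib/test_bpf.c below:

	struct bpf_insn insns[] = {
		/* r1 = 0x567800001234, encoded as two 8-byte blocks:
		 * insn[0]: BPF_LD | BPF_DW | BPF_IMM, dst_reg = 1,
		 *          imm = 0x00001234 (lower 32 bits)
		 * insn[1]: code = 0, imm = 0x00005678 (upper 32 bits)
		 */
		BPF_LD_IMM64(BPF_REG_1, 0x567800001234LL),
		BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 0),
		BPF_EXIT_INSN(),
	};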
+ Testing ------- diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 39ccfbb4a723..06f8c17f5484 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -393,6 +393,23 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, EMIT1_off32(add_1reg(0xB8, dst_reg), imm32); break; + case BPF_LD | BPF_IMM | BPF_DW: + if (insn[1].code != 0 || insn[1].src_reg != 0 || + insn[1].dst_reg != 0 || insn[1].off != 0) { + /* verifier must catch invalid insns */ + pr_err("invalid BPF_LD_IMM64 insn\n"); + return -EINVAL; + } + + /* movabsq %rax, imm64 */ + EMIT2(add_1mod(0x48, dst_reg), add_1reg(0xB8, dst_reg)); + EMIT(insn[0].imm, 4); + EMIT(insn[1].imm, 4); + + insn++; + i++; + break; + /* dst %= src, dst /= src, dst %= imm32, dst /= imm32 */ case BPF_ALU | BPF_MOD | BPF_X: case BPF_ALU | BPF_DIV | BPF_X: diff --git a/include/linux/filter.h b/include/linux/filter.h index c78994593355..bf323da77950 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -166,6 +166,24 @@ enum { .off = 0, \ .imm = IMM }) +/* BPF_LD_IMM64 macro encodes single 'load 64-bit immediate' insn */ +#define BPF_LD_IMM64(DST, IMM) \ + BPF_LD_IMM64_RAW(DST, 0, IMM) + +#define BPF_LD_IMM64_RAW(DST, SRC, IMM) \ + ((struct bpf_insn) { \ + .code = BPF_LD | BPF_DW | BPF_IMM, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = 0, \ + .imm = (__u32) (IMM) }), \ + ((struct bpf_insn) { \ + .code = 0, /* zero is reserved opcode */ \ + .dst_reg = 0, \ + .src_reg = 0, \ + .off = 0, \ + .imm = ((__u64) (IMM)) >> 32 }) + /* Short form of mov based on type, BPF_X: dst_reg = src_reg, BPF_K: dst_reg = imm32 */ #define BPF_MOV64_RAW(TYPE, DST, SRC, IMM) \ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index b54bb2c2e494..2c2bfaacce66 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -242,6 +242,7 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn) [BPF_LD | BPF_IND | BPF_W] = &&LD_IND_W, [BPF_LD | BPF_IND | BPF_H] = &&LD_IND_H, [BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B, + [BPF_LD | BPF_IMM | BPF_DW] = &&LD_IMM_DW, }; void *ptr; int off; @@ -301,6 +302,10 @@ select_insn: ALU64_MOV_K: DST = IMM; CONT; + LD_IMM_DW: + DST = (u64) (u32) insn[0].imm | ((u64) (u32) insn[1].imm) << 32; + insn++; + CONT; ALU64_ARSH_X: (*(s64 *) &DST) >>= SRC; CONT; diff --git a/lib/test_bpf.c b/lib/test_bpf.c index 9a67456ba29a..413890815d3e 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -1735,6 +1735,27 @@ static struct bpf_test tests[] = { { }, { { 1, 0 } }, }, + { + "load 64-bit immediate", + .u.insns_int = { + BPF_LD_IMM64(R1, 0x567800001234L), + BPF_MOV64_REG(R2, R1), + BPF_MOV64_REG(R3, R2), + BPF_ALU64_IMM(BPF_RSH, R2, 32), + BPF_ALU64_IMM(BPF_LSH, R3, 32), + BPF_ALU64_IMM(BPF_RSH, R3, 32), + BPF_ALU64_IMM(BPF_MOV, R0, 0), + BPF_JMP_IMM(BPF_JEQ, R2, 0x5678, 1), + BPF_EXIT_INSN(), + BPF_JMP_IMM(BPF_JEQ, R3, 0x1234, 1), + BPF_EXIT_INSN(), + BPF_ALU64_IMM(BPF_MOV, R0, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } } + }, }; static struct net_device dev; -- cgit v1.2.3-59-g8ed1b From daedfb22451dd02b35c0549566cbb7cc06bdd53b Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 4 Sep 2014 22:17:18 -0700 Subject: net: filter: split filter.h and expose eBPF to user space allow user space to generate eBPF programs uapi/linux/bpf.h: eBPF instruction set definition linux/filter.h: the rest This patch only moves macro definitions, but practically it freezes existing eBPF instruction set, though new instructions can still be added in the future. 
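As an illustrative sketch of what this exposes, a user-space generator can now encode eBPF instructions from uapi headers alone; the classic opcode fields (BPF_JMP, BPF_K, ...) still come from uapi/linux/filter.h, while the eBPF additions live in the new uapi/linux/bpf.h (hypothetical fragment, not part of the patch):

	#include <linux/filter.h>	/* classic BPF_LD/BPF_JMP/BPF_K fields */
	#include <linux/bpf.h>		/* BPF_ALU64, BPF_MOV, BPF_EXIT, BPF_REG_*, struct bpf_insn */

	/* r0 = 42; return */
	static const struct bpf_insn prog[] = {
		{ .code = BPF_ALU64 | BPF_MOV | BPF_K, .dst_reg = BPF_REG_0, .imm = 42 },
		{ .code = BPF_JMP | BPF_EXIT },
	};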
These eBPF definitions cannot go into uapi/linux/filter.h, since the names may conflict with existing applications. Full eBPF ISA description is in Documentation/networking/filter.txt Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/filter.h | 56 +--------------------------------------- include/uapi/linux/Kbuild | 1 + include/uapi/linux/bpf.h | 65 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 67 insertions(+), 55 deletions(-) create mode 100644 include/uapi/linux/bpf.h (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index bf323da77950..8f82ef3f1cdd 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -10,58 +10,12 @@ #include #include #include +#include struct sk_buff; struct sock; struct seccomp_data; -/* Internally used and optimized filter representation with extended - * instruction set based on top of classic BPF. - */ - -/* instruction classes */ -#define BPF_ALU64 0x07 /* alu mode in double word width */ - -/* ld/ldx fields */ -#define BPF_DW 0x18 /* double word */ -#define BPF_XADD 0xc0 /* exclusive add */ - -/* alu/jmp fields */ -#define BPF_MOV 0xb0 /* mov reg to reg */ -#define BPF_ARSH 0xc0 /* sign extending arithmetic shift right */ - -/* change endianness of a register */ -#define BPF_END 0xd0 /* flags for endianness conversion: */ -#define BPF_TO_LE 0x00 /* convert to little-endian */ -#define BPF_TO_BE 0x08 /* convert to big-endian */ -#define BPF_FROM_LE BPF_TO_LE -#define BPF_FROM_BE BPF_TO_BE - -#define BPF_JNE 0x50 /* jump != */ -#define BPF_JSGT 0x60 /* SGT is signed '>', GT in x86 */ -#define BPF_JSGE 0x70 /* SGE is signed '>=', GE in x86 */ -#define BPF_CALL 0x80 /* function call */ -#define BPF_EXIT 0x90 /* function return */ - -/* Register numbers */ -enum { - BPF_REG_0 = 0, - BPF_REG_1, - BPF_REG_2, - BPF_REG_3, - BPF_REG_4, - BPF_REG_5, - BPF_REG_6, - BPF_REG_7, - BPF_REG_8, - BPF_REG_9, - BPF_REG_10, - __MAX_BPF_REG, -}; - -/* BPF has 10 general purpose 64-bit registers and stack frame. */ -#define MAX_BPF_REG __MAX_BPF_REG - /* ArgX, context and stack frame pointer register positions. Note, * Arg1, Arg2, Arg3, etc are used as argument mappings of function * calls in BPF_CALL instruction. @@ -322,14 +276,6 @@ enum { #define SK_RUN_FILTER(filter, ctx) \ (*filter->prog->bpf_func)(ctx, filter->prog->insnsi) -struct bpf_insn { - __u8 code; /* opcode */ - __u8 dst_reg:4; /* dest register */ - __u8 src_reg:4; /* source register */ - __s16 off; /* signed offset */ - __s32 imm; /* signed immediate constant */ -}; - #ifdef CONFIG_COMPAT /* A struct sock_filter is architecture independent. */ struct compat_sock_fprog { diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index 24e9033f8b3f..fb3f7b675229 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -67,6 +67,7 @@ header-y += bfs_fs.h header-y += binfmts.h header-y += blkpg.h header-y += blktrace_api.h +header-y += bpf.h header-y += bpqether.h header-y += bsg.h header-y += btrfs.h diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h new file mode 100644 index 000000000000..479ed0b6be16 --- /dev/null +++ b/include/uapi/linux/bpf.h @@ -0,0 +1,65 @@ +/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ */ +#ifndef _UAPI__LINUX_BPF_H__ +#define _UAPI__LINUX_BPF_H__ + +#include + +/* Extended instruction set based on top of classic BPF */ + +/* instruction classes */ +#define BPF_ALU64 0x07 /* alu mode in double word width */ + +/* ld/ldx fields */ +#define BPF_DW 0x18 /* double word */ +#define BPF_XADD 0xc0 /* exclusive add */ + +/* alu/jmp fields */ +#define BPF_MOV 0xb0 /* mov reg to reg */ +#define BPF_ARSH 0xc0 /* sign extending arithmetic shift right */ + +/* change endianness of a register */ +#define BPF_END 0xd0 /* flags for endianness conversion: */ +#define BPF_TO_LE 0x00 /* convert to little-endian */ +#define BPF_TO_BE 0x08 /* convert to big-endian */ +#define BPF_FROM_LE BPF_TO_LE +#define BPF_FROM_BE BPF_TO_BE + +#define BPF_JNE 0x50 /* jump != */ +#define BPF_JSGT 0x60 /* SGT is signed '>', GT in x86 */ +#define BPF_JSGE 0x70 /* SGE is signed '>=', GE in x86 */ +#define BPF_CALL 0x80 /* function call */ +#define BPF_EXIT 0x90 /* function return */ + +/* Register numbers */ +enum { + BPF_REG_0 = 0, + BPF_REG_1, + BPF_REG_2, + BPF_REG_3, + BPF_REG_4, + BPF_REG_5, + BPF_REG_6, + BPF_REG_7, + BPF_REG_8, + BPF_REG_9, + BPF_REG_10, + __MAX_BPF_REG, +}; + +/* BPF has 10 general purpose 64-bit registers and stack frame. */ +#define MAX_BPF_REG __MAX_BPF_REG + +struct bpf_insn { + __u8 code; /* opcode */ + __u8 dst_reg:4; /* dest register */ + __u8 src_reg:4; /* source register */ + __s16 off; /* signed offset */ + __s32 imm; /* signed immediate constant */ +}; + +#endif /* _UAPI__LINUX_BPF_H__ */ -- cgit v1.2.3-59-g8ed1b From dc8ecdd3a3fccf73fcb07711cde064ce5727f9d1 Mon Sep 17 00:00:00 2001 From: Rafał Miłecki Date: Mon, 1 Sep 2014 23:11:06 +0200 Subject: bcma: move bus struct setup into early part of host specific code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change is important for SoC host. In future we will want to know chip ID (needed for early MIPS boot) before doing cores scanning. Signed-off-by: Rafał Miłecki Acked-by: Hauke Mehrtens Signed-off-by: John W. 
Linville --- drivers/bcma/host_pci.c | 3 +++ drivers/bcma/host_soc.c | 3 +++ drivers/bcma/main.c | 2 -- drivers/bcma/scan.c | 7 ------- include/linux/bcma/bcma.h | 1 - 5 files changed, 6 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/bcma/host_pci.c b/drivers/bcma/host_pci.c index f032ed6dd459..1e5ac0a79696 100644 --- a/drivers/bcma/host_pci.c +++ b/drivers/bcma/host_pci.c @@ -208,6 +208,9 @@ static int bcma_host_pci_probe(struct pci_dev *dev, bus->boardinfo.vendor = bus->host_pci->subsystem_vendor; bus->boardinfo.type = bus->host_pci->subsystem_device; + /* Initialize struct, detect chip */ + bcma_init_bus(bus); + /* Register */ err = bcma_bus_register(bus); if (err) diff --git a/drivers/bcma/host_soc.c b/drivers/bcma/host_soc.c index 1edd7e064621..379e0d4ebe72 100644 --- a/drivers/bcma/host_soc.c +++ b/drivers/bcma/host_soc.c @@ -178,6 +178,9 @@ int __init bcma_host_soc_register(struct bcma_soc *soc) bus->hosttype = BCMA_HOSTTYPE_SOC; bus->ops = &bcma_host_soc_ops; + /* Initialize struct, detect chip */ + bcma_init_bus(bus); + /* Register */ err = bcma_bus_early_register(bus, &soc->core_cc, &soc->core_mips); if (err) diff --git a/drivers/bcma/main.c b/drivers/bcma/main.c index 0ff8d58831ef..9f6b0cb9a12c 100644 --- a/drivers/bcma/main.c +++ b/drivers/bcma/main.c @@ -334,8 +334,6 @@ int __init bcma_bus_early_register(struct bcma_bus *bus, struct bcma_device *core; struct bcma_device_id match; - bcma_init_bus(bus); - match.manuf = BCMA_MANUF_BCM; match.id = bcma_cc_core_id(bus); match.class = BCMA_CL_SIM; diff --git a/drivers/bcma/scan.c b/drivers/bcma/scan.c index e9bd77249a4c..e2b990303042 100644 --- a/drivers/bcma/scan.c +++ b/drivers/bcma/scan.c @@ -438,9 +438,6 @@ void bcma_init_bus(struct bcma_bus *bus) s32 tmp; struct bcma_chipinfo *chipinfo = &(bus->chipinfo); - if (bus->init_done) - return; - INIT_LIST_HEAD(&bus->cores); bus->nr_cores = 0; @@ -452,8 +449,6 @@ void bcma_init_bus(struct bcma_bus *bus) chipinfo->pkg = (tmp & BCMA_CC_ID_PKG) >> BCMA_CC_ID_PKG_SHIFT; bcma_info(bus, "Found chip with id 0x%04X, rev 0x%02X and package 0x%02X\n", chipinfo->id, chipinfo->rev, chipinfo->pkg); - - bus->init_done = true; } int bcma_bus_scan(struct bcma_bus *bus) @@ -463,8 +458,6 @@ int bcma_bus_scan(struct bcma_bus *bus) int err, core_num = 0; - bcma_init_bus(bus); - erombase = bcma_scan_read32(bus, 0, BCMA_CC_EROM); if (bus->hosttype == BCMA_HOSTTYPE_SOC) { eromptr = ioremap_nocache(erombase, BCMA_CORE_SIZE); diff --git a/include/linux/bcma/bcma.h b/include/linux/bcma/bcma.h index 0272e49135d0..c1ba87d1548e 100644 --- a/include/linux/bcma/bcma.h +++ b/include/linux/bcma/bcma.h @@ -332,7 +332,6 @@ struct bcma_bus { struct bcma_device *mapped_core; struct list_head cores; u8 nr_cores; - u8 init_done:1; u8 num; struct bcma_drv_cc drv_cc; -- cgit v1.2.3-59-g8ed1b From a395135ddebb0a06052b84c309eb6cb68b79c797 Mon Sep 17 00:00:00 2001 From: Rafał Miłecki Date: Mon, 1 Sep 2014 23:11:07 +0200 Subject: bcma: use separated function to initialize bus on SoC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is required to split SoC bus init into two phases. The later one (which includes scanning) should be called when kalloc is available. Cc: Ralf Baechle Signed-off-by: Rafał Miłecki Acked-by: Hauke Mehrtens Signed-off-by: John W. 
Linville --- arch/mips/bcm47xx/setup.c | 4 ++++ drivers/bcma/host_soc.c | 11 +++++++++-- include/linux/bcma/bcma_soc.h | 1 + 3 files changed, 14 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/arch/mips/bcm47xx/setup.c b/arch/mips/bcm47xx/setup.c index 2b63e7e7d3d3..fff6ed4978c7 100644 --- a/arch/mips/bcm47xx/setup.c +++ b/arch/mips/bcm47xx/setup.c @@ -201,6 +201,10 @@ static void __init bcm47xx_register_bcma(void) pr_warn("bcm47xx: someone else already registered a bcma SPROM callback handler (err %d)\n", err); err = bcma_host_soc_register(&bcm47xx_bus.bcma); + if (err) + panic("Failed to register BCMA bus (err %d)", err); + + err = bcma_host_soc_init(&bcm47xx_bus.bcma); if (err) panic("Failed to initialize BCMA bus (err %d)", err); diff --git a/drivers/bcma/host_soc.c b/drivers/bcma/host_soc.c index 379e0d4ebe72..718e054dd727 100644 --- a/drivers/bcma/host_soc.c +++ b/drivers/bcma/host_soc.c @@ -165,7 +165,6 @@ static const struct bcma_host_ops bcma_host_soc_ops = { int __init bcma_host_soc_register(struct bcma_soc *soc) { struct bcma_bus *bus = &soc->bus; - int err; /* iomap only first core. We have to read some register on this core * to scan the bus. @@ -181,7 +180,15 @@ int __init bcma_host_soc_register(struct bcma_soc *soc) /* Initialize struct, detect chip */ bcma_init_bus(bus); - /* Register */ + return 0; +} + +int __init bcma_host_soc_init(struct bcma_soc *soc) +{ + struct bcma_bus *bus = &soc->bus; + int err; + + /* Scan bus and initialize it */ err = bcma_bus_early_register(bus, &soc->core_cc, &soc->core_mips); if (err) iounmap(bus->mmio); diff --git a/include/linux/bcma/bcma_soc.h b/include/linux/bcma/bcma_soc.h index 4203c5593b9f..f24d245f8394 100644 --- a/include/linux/bcma/bcma_soc.h +++ b/include/linux/bcma/bcma_soc.h @@ -10,6 +10,7 @@ struct bcma_soc { }; int __init bcma_host_soc_register(struct bcma_soc *soc); +int __init bcma_host_soc_init(struct bcma_soc *soc); int bcma_bus_register(struct bcma_bus *bus); -- cgit v1.2.3-59-g8ed1b From 23a2f39c8f4035eade7f226eb7ada30c78d9eee3 Mon Sep 17 00:00:00 2001 From: Hauke Mehrtens Date: Mon, 8 Sep 2014 22:53:35 +0200 Subject: bcma: store more alternative addresses Each core could have more than one alternative address. There are cores with 8 alternative addresses for different functions. The PHY control in the Chip common B core is done through the 2. alternative address and not the first one. Signed-off-by: Hauke Mehrtens CC: linux-usb@vger.kernel.org Signed-off-by: John W. 
Linville --- drivers/bcma/scan.c | 9 +++++---- drivers/usb/host/bcma-hcd.c | 2 +- include/linux/bcma/bcma.h | 2 +- 3 files changed, 7 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/bcma/scan.c b/drivers/bcma/scan.c index e2b990303042..06be7282cbc4 100644 --- a/drivers/bcma/scan.c +++ b/drivers/bcma/scan.c @@ -276,7 +276,7 @@ static int bcma_get_next_core(struct bcma_bus *bus, u32 __iomem **eromptr, struct bcma_device *core) { u32 tmp; - u8 i, j; + u8 i, j, k; s32 cia, cib; u8 ports[2], wrappers[2]; @@ -367,6 +367,7 @@ static int bcma_get_next_core(struct bcma_bus *bus, u32 __iomem **eromptr, core->addr = tmp; /* get & parse slave ports */ + k = 0; for (i = 0; i < ports[1]; i++) { for (j = 0; ; j++) { tmp = bcma_erom_get_addr_desc(bus, eromptr, @@ -376,9 +377,9 @@ static int bcma_get_next_core(struct bcma_bus *bus, u32 __iomem **eromptr, /* pr_debug("erom: slave port %d " * "has %d descriptors\n", i, j); */ break; - } else { - if (i == 0 && j == 0) - core->addr1 = tmp; + } else if (k < ARRAY_SIZE(core->addr_s)) { + core->addr_s[k] = tmp; + k++; } } } diff --git a/drivers/usb/host/bcma-hcd.c b/drivers/usb/host/bcma-hcd.c index 205f4a336583..cd6d0afb6b8f 100644 --- a/drivers/usb/host/bcma-hcd.c +++ b/drivers/usb/host/bcma-hcd.c @@ -237,7 +237,7 @@ static int bcma_hcd_probe(struct bcma_device *dev) bcma_hcd_init_chip(dev); /* In AI chips EHCI is addrspace 0, OHCI is 1 */ - ohci_addr = dev->addr1; + ohci_addr = dev->addr_s[0]; if ((chipinfo->id == 0x5357 || chipinfo->id == 0x4749) && chipinfo->rev == 0) ohci_addr = 0x18009000; diff --git a/include/linux/bcma/bcma.h b/include/linux/bcma/bcma.h index c1ba87d1548e..7fc16c991291 100644 --- a/include/linux/bcma/bcma.h +++ b/include/linux/bcma/bcma.h @@ -267,7 +267,7 @@ struct bcma_device { u8 core_unit; u32 addr; - u32 addr1; + u32 addr_s[8]; u32 wrap; void __iomem *io_addr; -- cgit v1.2.3-59-g8ed1b From 1716bcf3f76fe71e98d4851a3eb73ea3d93d4773 Mon Sep 17 00:00:00 2001 From: Hauke Mehrtens Date: Mon, 8 Sep 2014 22:53:36 +0200 Subject: bcma: add support for chipcommon B core This core is used on BCM4708 to configure the PCIe and USB3 PHYs and it contains the addresses to the Device Management unit. This will be used by the PCIe driver first. Signed-off-by: Hauke Mehrtens Signed-off-by: John W. 
Linville --- drivers/bcma/Makefile | 1 + drivers/bcma/bcma_private.h | 4 ++ drivers/bcma/driver_chipcommon_b.c | 61 +++++++++++++++++++++++++++++ drivers/bcma/main.c | 10 +++++ drivers/bcma/scan.c | 1 + include/linux/bcma/bcma.h | 1 + include/linux/bcma/bcma_driver_chipcommon.h | 8 ++++ 7 files changed, 86 insertions(+) create mode 100644 drivers/bcma/driver_chipcommon_b.c (limited to 'include/linux') diff --git a/drivers/bcma/Makefile b/drivers/bcma/Makefile index 91290f7f61b8..838b4b9d352f 100644 --- a/drivers/bcma/Makefile +++ b/drivers/bcma/Makefile @@ -1,5 +1,6 @@ bcma-y += main.o scan.o core.o sprom.o bcma-y += driver_chipcommon.o driver_chipcommon_pmu.o +bcma-y += driver_chipcommon_b.o bcma-$(CONFIG_BCMA_SFLASH) += driver_chipcommon_sflash.o bcma-$(CONFIG_BCMA_NFLASH) += driver_chipcommon_nflash.o bcma-y += driver_pci.o diff --git a/drivers/bcma/bcma_private.h b/drivers/bcma/bcma_private.h index 09b632ad0fe2..b40be43c6f31 100644 --- a/drivers/bcma/bcma_private.h +++ b/drivers/bcma/bcma_private.h @@ -50,6 +50,10 @@ void bcma_chipco_serial_init(struct bcma_drv_cc *cc); extern struct platform_device bcma_pflash_dev; #endif /* CONFIG_BCMA_DRIVER_MIPS */ +/* driver_chipcommon_b.c */ +int bcma_core_chipcommon_b_init(struct bcma_drv_cc_b *ccb); +void bcma_core_chipcommon_b_free(struct bcma_drv_cc_b *ccb); + /* driver_chipcommon_pmu.c */ u32 bcma_pmu_get_alp_clock(struct bcma_drv_cc *cc); u32 bcma_pmu_get_cpu_clock(struct bcma_drv_cc *cc); diff --git a/drivers/bcma/driver_chipcommon_b.c b/drivers/bcma/driver_chipcommon_b.c new file mode 100644 index 000000000000..c20b5f4ff290 --- /dev/null +++ b/drivers/bcma/driver_chipcommon_b.c @@ -0,0 +1,61 @@ +/* + * Broadcom specific AMBA + * ChipCommon B Unit driver + * + * Copyright 2014, Hauke Mehrtens + * + * Licensed under the GNU/GPL. See COPYING for details. 
+ */ + +#include "bcma_private.h" +#include +#include + +static bool bcma_wait_reg(struct bcma_bus *bus, void __iomem *addr, u32 mask, + u32 value, int timeout) +{ + unsigned long deadline = jiffies + timeout; + u32 val; + + do { + val = readl(addr); + if ((val & mask) == value) + return true; + cpu_relax(); + udelay(10); + } while (!time_after_eq(jiffies, deadline)); + + bcma_err(bus, "Timeout waiting for register %p\n", addr); + + return false; +} + +void bcma_chipco_b_mii_write(struct bcma_drv_cc_b *ccb, u32 offset, u32 value) +{ + struct bcma_bus *bus = ccb->core->bus; + + writel(offset, ccb->mii + 0x00); + bcma_wait_reg(bus, ccb->mii + 0x00, 0x0100, 0x0000, 100); + writel(value, ccb->mii + 0x04); + bcma_wait_reg(bus, ccb->mii + 0x00, 0x0100, 0x0000, 100); +} +EXPORT_SYMBOL_GPL(bcma_chipco_b_mii_write); + +int bcma_core_chipcommon_b_init(struct bcma_drv_cc_b *ccb) +{ + if (ccb->setup_done) + return 0; + + ccb->setup_done = 1; + ccb->mii = ioremap_nocache(ccb->core->addr_s[1], BCMA_CORE_SIZE); + if (!ccb->mii) + return -ENOMEM; + + return 0; +} + +void bcma_core_chipcommon_b_free(struct bcma_drv_cc_b *ccb) +{ + if (ccb->mii) + iounmap(ccb->mii); +} diff --git a/drivers/bcma/main.c b/drivers/bcma/main.c index 297a2d46985a..c421403cab43 100644 --- a/drivers/bcma/main.c +++ b/drivers/bcma/main.c @@ -173,6 +173,7 @@ static int bcma_register_devices(struct bcma_bus *bus) switch (core->id.id) { case BCMA_CORE_4706_CHIPCOMMON: case BCMA_CORE_CHIPCOMMON: + case BCMA_CORE_NS_CHIPCOMMON_B: case BCMA_CORE_PCI: case BCMA_CORE_PCIE: case BCMA_CORE_PCIE2: @@ -287,6 +288,13 @@ int bcma_bus_register(struct bcma_bus *bus) bcma_core_chipcommon_init(&bus->drv_cc); } + /* Init CC core */ + core = bcma_find_core(bus, BCMA_CORE_NS_CHIPCOMMON_B); + if (core) { + bus->drv_cc_b.core = core; + bcma_core_chipcommon_b_init(&bus->drv_cc_b); + } + /* Init MIPS core */ core = bcma_find_core(bus, BCMA_CORE_MIPS_74K); if (core) { @@ -341,6 +349,8 @@ void bcma_bus_unregister(struct bcma_bus *bus) else if (err) bcma_err(bus, "Can not unregister GPIO driver: %i\n", err); + bcma_core_chipcommon_b_free(&bus->drv_cc_b); + cores[0] = bcma_find_core(bus, BCMA_CORE_MIPS_74K); cores[1] = bcma_find_core(bus, BCMA_CORE_PCIE); cores[2] = bcma_find_core(bus, BCMA_CORE_4706_MAC_GBIT_COMMON); diff --git a/drivers/bcma/scan.c b/drivers/bcma/scan.c index 06be7282cbc4..b3a403c136fb 100644 --- a/drivers/bcma/scan.c +++ b/drivers/bcma/scan.c @@ -314,6 +314,7 @@ static int bcma_get_next_core(struct bcma_bus *bus, u32 __iomem **eromptr, /* Some specific cores don't need wrappers */ switch (core->id.id) { case BCMA_CORE_4706_MAC_GBIT_COMMON: + case BCMA_CORE_NS_CHIPCOMMON_B: /* Not used yet: case BCMA_CORE_OOB_ROUTER: */ break; default: diff --git a/include/linux/bcma/bcma.h b/include/linux/bcma/bcma.h index 7fc16c991291..634597917670 100644 --- a/include/linux/bcma/bcma.h +++ b/include/linux/bcma/bcma.h @@ -335,6 +335,7 @@ struct bcma_bus { u8 num; struct bcma_drv_cc drv_cc; + struct bcma_drv_cc_b drv_cc_b; struct bcma_drv_pci drv_pci[2]; struct bcma_drv_pcie2 drv_pcie2; struct bcma_drv_mips drv_mips; diff --git a/include/linux/bcma/bcma_driver_chipcommon.h b/include/linux/bcma/bcma_driver_chipcommon.h index 63d105cd14a3..db6fa217f98b 100644 --- a/include/linux/bcma/bcma_driver_chipcommon.h +++ b/include/linux/bcma/bcma_driver_chipcommon.h @@ -644,6 +644,12 @@ struct bcma_drv_cc { #endif }; +struct bcma_drv_cc_b { + struct bcma_device *core; + u8 setup_done:1; + void __iomem *mii; +}; + /* Register access */ #define bcma_cc_read32(cc, 
offset) \ bcma_read32((cc)->core, offset) @@ -699,4 +705,6 @@ extern void bcma_pmu_spuravoid_pllupdate(struct bcma_drv_cc *cc, int spuravoid); extern u32 bcma_pmu_get_bus_clock(struct bcma_drv_cc *cc); +void bcma_chipco_b_mii_write(struct bcma_drv_cc_b *ccb, u32 offset, u32 value); + #endif /* LINUX_BCMA_DRIVER_CC_H_ */ -- cgit v1.2.3-59-g8ed1b From 738cbe72adc5c8f2016c4c68aa5162631d4f27e1 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 8 Sep 2014 08:04:47 +0200 Subject: net: bpf: consolidate JIT binary allocator Introduced in commit 314beb9bcabf ("x86: bpf_jit_comp: secure bpf jit against spraying attacks") and later on replicated in aa2d2c73c21f ("s390/bpf,jit: address randomize and write protect jit code") for s390 architecture, write protection for BPF JIT images got added and a random start address of the JIT code, so that it's not on a page boundary anymore. Since both use a very similar allocator for the BPF binary header, we can consolidate this code into the BPF core as it's mostly JIT independant anyway. This will also allow for future archs that support DEBUG_SET_MODULE_RONX to just reuse instead of reimplementing it. JIT tested on x86_64 and s390x with BPF test suite. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Cc: Eric Dumazet Cc: Heiko Carstens Cc: Martin Schwidefsky Signed-off-by: David S. Miller --- arch/s390/net/bpf_jit_comp.c | 45 ++++++++------------------------------- arch/x86/net/bpf_jit_comp.c | 50 ++++++++++---------------------------------- include/linux/filter.h | 13 ++++++++++++ kernel/bpf/core.c | 39 ++++++++++++++++++++++++++++++++++ 4 files changed, 72 insertions(+), 75 deletions(-) (limited to 'include/linux') diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index f2833c5b218a..b734f975c22e 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -5,11 +5,9 @@ * * Author(s): Martin Schwidefsky */ -#include #include #include #include -#include #include #include #include @@ -148,6 +146,12 @@ struct bpf_jit { ret; \ }) +static void bpf_jit_fill_hole(void *area, unsigned int size) +{ + /* Fill whole space with illegal instructions */ + memset(area, 0, size); +} + static void bpf_jit_prologue(struct bpf_jit *jit) { /* Save registers and create stack frame if necessary */ @@ -780,38 +784,6 @@ out: return -1; } -/* - * Note: for security reasons, bpf code will follow a randomly - * sized amount of illegal instructions. - */ -struct bpf_binary_header { - unsigned int pages; - u8 image[]; -}; - -static struct bpf_binary_header *bpf_alloc_binary(unsigned int bpfsize, - u8 **image_ptr) -{ - struct bpf_binary_header *header; - unsigned int sz, hole; - - /* Most BPF filters are really small, but if some of them fill a page, - * allow at least 128 extra bytes for illegal instructions. - */ - sz = round_up(bpfsize + sizeof(*header) + 128, PAGE_SIZE); - header = module_alloc(sz); - if (!header) - return NULL; - memset(header, 0, sz); - header->pages = sz / PAGE_SIZE; - hole = min(sz - (bpfsize + sizeof(*header)), PAGE_SIZE - sizeof(*header)); - /* Insert random number of illegal instructions before BPF code - * and make sure the first instruction starts at an even address. 
- */ - *image_ptr = &header->image[(prandom_u32() % hole) & -2]; - return header; -} - void bpf_jit_compile(struct bpf_prog *fp) { struct bpf_binary_header *header = NULL; @@ -850,7 +822,8 @@ void bpf_jit_compile(struct bpf_prog *fp) size = prg_len + lit_len; if (size >= BPF_SIZE_MAX) goto out; - header = bpf_alloc_binary(size, &jit.start); + header = bpf_jit_binary_alloc(size, &jit.start, + 2, bpf_jit_fill_hole); if (!header) goto out; jit.prg = jit.mid = jit.start + prg_len; @@ -884,7 +857,7 @@ void bpf_jit_free(struct bpf_prog *fp) goto free_filter; set_memory_rw(addr, header->pages); - module_free(NULL, header); + bpf_jit_binary_free(header); free_filter: bpf_prog_unlock_free(fp); diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 06f8c17f5484..9de0b5476b0c 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -8,12 +8,10 @@ * as published by the Free Software Foundation; version 2 * of the License. */ -#include -#include #include #include #include -#include +#include int bpf_jit_enable __read_mostly; @@ -109,39 +107,6 @@ static inline void bpf_flush_icache(void *start, void *end) #define CHOOSE_LOAD_FUNC(K, func) \ ((int)K < 0 ? ((int)K >= SKF_LL_OFF ? func##_negative_offset : func) : func##_positive_offset) -struct bpf_binary_header { - unsigned int pages; - /* Note : for security reasons, bpf code will follow a randomly - * sized amount of int3 instructions - */ - u8 image[]; -}; - -static struct bpf_binary_header *bpf_alloc_binary(unsigned int proglen, - u8 **image_ptr) -{ - unsigned int sz, hole; - struct bpf_binary_header *header; - - /* Most of BPF filters are really small, - * but if some of them fill a page, allow at least - * 128 extra bytes to insert a random section of int3 - */ - sz = round_up(proglen + sizeof(*header) + 128, PAGE_SIZE); - header = module_alloc(sz); - if (!header) - return NULL; - - memset(header, 0xcc, sz); /* fill whole space with int3 instructions */ - - header->pages = sz / PAGE_SIZE; - hole = min(sz - (proglen + sizeof(*header)), PAGE_SIZE - sizeof(*header)); - - /* insert a random number of int3 instructions before BPF code */ - *image_ptr = &header->image[prandom_u32() % hole]; - return header; -} - /* pick a register outside of BPF range for JIT internal work */ #define AUX_REG (MAX_BPF_REG + 1) @@ -206,6 +171,12 @@ static inline u8 add_2reg(u8 byte, u32 dst_reg, u32 src_reg) return byte + reg2hex[dst_reg] + (reg2hex[src_reg] << 3); } +static void jit_fill_hole(void *area, unsigned int size) +{ + /* fill whole space with int3 instructions */ + memset(area, 0xcc, size); +} + struct jit_context { unsigned int cleanup_addr; /* epilogue code offset */ bool seen_ld_abs; @@ -959,7 +930,7 @@ void bpf_int_jit_compile(struct bpf_prog *prog) if (proglen <= 0) { image = NULL; if (header) - module_free(NULL, header); + bpf_jit_binary_free(header); goto out; } if (image) { @@ -969,7 +940,8 @@ void bpf_int_jit_compile(struct bpf_prog *prog) break; } if (proglen == oldproglen) { - header = bpf_alloc_binary(proglen, &image); + header = bpf_jit_binary_alloc(proglen, &image, + 1, jit_fill_hole); if (!header) goto out; } @@ -998,7 +970,7 @@ void bpf_jit_free(struct bpf_prog *fp) goto free_filter; set_memory_rw(addr, header->pages); - module_free(NULL, header); + bpf_jit_binary_free(header); free_filter: bpf_prog_unlock_free(fp); diff --git a/include/linux/filter.h b/include/linux/filter.h index 8f82ef3f1cdd..868764fcffb8 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -289,6 +289,11 @@ struct 
sock_fprog_kern { struct sock_filter *filter; }; +struct bpf_binary_header { + unsigned int pages; + u8 image[]; +}; + struct bpf_work_struct { struct bpf_prog *prog; struct work_struct work; @@ -358,6 +363,14 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, gfp_t gfp_extra_flags); void __bpf_prog_free(struct bpf_prog *fp); +typedef void (*bpf_jit_fill_hole_t)(void *area, unsigned int size); + +struct bpf_binary_header * +bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, + unsigned int alignment, + bpf_jit_fill_hole_t bpf_fill_ill_insns); +void bpf_jit_binary_free(struct bpf_binary_header *hdr); + static inline void bpf_prog_unlock_free(struct bpf_prog *fp) { bpf_prog_unlock_ro(fp); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 2c2bfaacce66..8ee520f0ec70 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -20,9 +20,12 @@ * Andi Kleen - Fix a few bad bugs and races. * Kris Katterjohn - Added many additional checks in bpf_check_classic() */ + #include #include #include +#include +#include #include /* Registers */ @@ -125,6 +128,42 @@ void __bpf_prog_free(struct bpf_prog *fp) } EXPORT_SYMBOL_GPL(__bpf_prog_free); +struct bpf_binary_header * +bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, + unsigned int alignment, + bpf_jit_fill_hole_t bpf_fill_ill_insns) +{ + struct bpf_binary_header *hdr; + unsigned int size, hole, start; + + /* Most of BPF filters are really small, but if some of them + * fill a page, allow at least 128 extra bytes to insert a + * random section of illegal instructions. + */ + size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE); + hdr = module_alloc(size); + if (hdr == NULL) + return NULL; + + /* Fill space with illegal/arch-dep instructions. */ + bpf_fill_ill_insns(hdr, size); + + hdr->pages = size / PAGE_SIZE; + hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)), + PAGE_SIZE - sizeof(*hdr)); + start = (prandom_u32() % hole) & ~(alignment - 1); + + /* Leave a random number of instructions before BPF code. */ + *image_ptr = &hdr->image[start]; + + return hdr; +} + +void bpf_jit_binary_free(struct bpf_binary_header *hdr) +{ + module_free(NULL, hdr); +} + /* Base function for offset calculation. Needs to go into .text section, * therefore keeping it non-static as well; will also be used by JITs * anyway later on, so do not let the compiler omit it. -- cgit v1.2.3-59-g8ed1b From 286aad3c4014ca825c447e07e24f8929e6d266d2 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 8 Sep 2014 08:04:49 +0200 Subject: net: bpf: be friendly to kmemcheck Reported by Mikulas Patocka, kmemcheck currently barks out a false positive since we don't have special kmemcheck annotation for bitfields used in bpf_prog structure. We currently have jited:1, len:31 and thus when accessing len while CONFIG_KMEMCHECK enabled, kmemcheck throws a warning that we're reading uninitialized memory. As we don't need the whole bit universe for pages member, we can just split it to u16 and use a bool flag for jited instead of a bitfield. Signed-off-by: Mikulas Patocka Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- arch/arm/net/bpf_jit_32.c | 2 +- arch/mips/net/bpf_jit.c | 2 +- arch/powerpc/net/bpf_jit_comp.c | 2 +- arch/s390/net/bpf_jit_comp.c | 2 +- arch/sparc/net/bpf_jit_comp.c | 2 +- arch/x86/net/bpf_jit_comp.c | 2 +- include/linux/filter.h | 6 +++--- net/core/filter.c | 2 +- 8 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index 2d1a5b93d91c..6b45f649eff0 100644 --- a/arch/arm/net/bpf_jit_32.c +++ b/arch/arm/net/bpf_jit_32.c @@ -933,7 +933,7 @@ void bpf_jit_compile(struct bpf_prog *fp) set_memory_ro((unsigned long)header, header->pages); fp->bpf_func = (void *)ctx.target; - fp->jited = 1; + fp->jited = true; out: kfree(ctx.offsets); return; diff --git a/arch/mips/net/bpf_jit.c b/arch/mips/net/bpf_jit.c index cfa83cf2447d..0e97ccd29fe3 100644 --- a/arch/mips/net/bpf_jit.c +++ b/arch/mips/net/bpf_jit.c @@ -1417,7 +1417,7 @@ void bpf_jit_compile(struct bpf_prog *fp) bpf_jit_dump(fp->len, alloc_size, 2, ctx.target); fp->bpf_func = (void *)ctx.target; - fp->jited = 1; + fp->jited = true; out: kfree(ctx.offsets); diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c index 40c53ff59124..cbae2dfd053c 100644 --- a/arch/powerpc/net/bpf_jit_comp.c +++ b/arch/powerpc/net/bpf_jit_comp.c @@ -686,7 +686,7 @@ void bpf_jit_compile(struct bpf_prog *fp) ((u64 *)image)[0] = (u64)code_base; ((u64 *)image)[1] = local_paca->kernel_toc; fp->bpf_func = (void *)image; - fp->jited = 1; + fp->jited = true; } out: kfree(addrs); diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index b734f975c22e..555f5c7e83ab 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -842,7 +842,7 @@ void bpf_jit_compile(struct bpf_prog *fp) if (jit.start) { set_memory_ro((unsigned long)header, header->pages); fp->bpf_func = (void *) jit.start; - fp->jited = 1; + fp->jited = true; } out: kfree(addrs); diff --git a/arch/sparc/net/bpf_jit_comp.c b/arch/sparc/net/bpf_jit_comp.c index f7a736b645e8..b2ad9dc5425e 100644 --- a/arch/sparc/net/bpf_jit_comp.c +++ b/arch/sparc/net/bpf_jit_comp.c @@ -801,7 +801,7 @@ cond_branch: f_offset = addrs[i + filter[i].jf]; if (image) { bpf_flush_icache(image, image + proglen); fp->bpf_func = (void *)image; - fp->jited = 1; + fp->jited = true; } out: kfree(addrs); diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 9de0b5476b0c..d56cd1f515bd 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -955,7 +955,7 @@ void bpf_int_jit_compile(struct bpf_prog *prog) bpf_flush_icache(header, image + proglen); set_memory_ro((unsigned long)header, header->pages); prog->bpf_func = (void *)image; - prog->jited = 1; + prog->jited = true; } out: kfree(addrs); diff --git a/include/linux/filter.h b/include/linux/filter.h index 868764fcffb8..4b59edead908 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -300,9 +300,9 @@ struct bpf_work_struct { }; struct bpf_prog { - u32 pages; /* Number of allocated pages */ - u32 jited:1, /* Is our filter JIT'ed? */ - len:31; /* Number of filter blocks */ + u16 pages; /* Number of allocated pages */ + bool jited; /* Is our filter JIT'ed? 
*/ + u32 len; /* Number of filter blocks */ struct sock_fprog_kern *orig_prog; /* Original BPF program */ struct bpf_work_struct *work; /* Deferred free work struct */ unsigned int (*bpf_func)(const struct sk_buff *skb, diff --git a/net/core/filter.c b/net/core/filter.c index fa5b7d0f77ac..dfc716ffa44b 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -972,7 +972,7 @@ static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp) int err; fp->bpf_func = NULL; - fp->jited = 0; + fp->jited = false; err = bpf_check_classic(fp->insns, fp->len); if (err) { -- cgit v1.2.3-59-g8ed1b From b954d83421d51d822c42e5ab7b65069b25ad3005 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 10 Sep 2014 15:01:02 +0200 Subject: net: bpf: only build bpf_jit_binary_{alloc, free}() when jit selected Since the BPF JIT depends on the availability of the module_alloc() and module_free() helpers (HAVE_BPF_JIT and MODULES), we should build that code only when BPF_JIT is enabled in the config, just like the other JIT code. Fixes builds for arm/marzen_defconfig and sh/rsk7269_defconfig. ==================== kernel/built-in.o: In function `bpf_jit_binary_alloc': /home/cwang/linux/kernel/bpf/core.c:144: undefined reference to `module_alloc' kernel/built-in.o: In function `bpf_jit_binary_free': /home/cwang/linux/kernel/bpf/core.c:164: undefined reference to `module_free' make: *** [vmlinux] Error 1 ====================
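As a reminder of how an architecture back end consumes these helpers once CONFIG_BPF_JIT is selected, here is a minimal sketch; it is not taken from any in-tree JIT, and the fill pattern and alignment are illustrative assumptions:

static void example_fill_hole(void *area, unsigned int size)
{
	/* pad unused space with a trapping opcode for this architecture */
	memset(area, 0xcc, size);
}

static void example_jit_compile(struct bpf_prog *prog, unsigned int proglen)
{
	struct bpf_binary_header *hdr;
	u8 *image;

	hdr = bpf_jit_binary_alloc(proglen, &image, 4, example_fill_hole);
	if (!hdr)
		return; /* fall back to the interpreter */

	/* ... emit native instructions into image ... */

	prog->bpf_func = (void *)image;
	prog->jited = true;
}

Reported-by: Fengguang Wu Fixes: 738cbe72adc5 ("net: bpf: consolidate JIT binary allocator") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/filter.h | 78 +++++++++++++++++++++++++------------------------- kernel/bpf/core.c | 2 ++ 2 files changed, 41 insertions(+), 39 deletions(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 4b59edead908..1a0bc6d134d7 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -4,12 +4,18 @@ #ifndef __LINUX_FILTER_H__ #define __LINUX_FILTER_H__ +#include + #include #include #include +#include +#include #include -#include + #include + +#include #include struct sk_buff; @@ -363,14 +369,6 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, gfp_t gfp_extra_flags); void __bpf_prog_free(struct bpf_prog *fp); -typedef void (*bpf_jit_fill_hole_t)(void *area, unsigned int size); - -struct bpf_binary_header * -bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, - unsigned int alignment, - bpf_jit_fill_hole_t bpf_fill_ill_insns); -void bpf_jit_binary_free(struct bpf_binary_header *hdr); - static inline void bpf_prog_unlock_free(struct bpf_prog *fp) { bpf_prog_unlock_ro(fp); @@ -393,6 +391,38 @@ void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp); u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); void bpf_int_jit_compile(struct bpf_prog *fp); +#ifdef CONFIG_BPF_JIT +typedef void (*bpf_jit_fill_hole_t)(void *area, unsigned int size); + +struct bpf_binary_header * +bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, + unsigned int alignment, + bpf_jit_fill_hole_t bpf_fill_ill_insns); +void bpf_jit_binary_free(struct bpf_binary_header *hdr); + +void bpf_jit_compile(struct bpf_prog *fp); +void bpf_jit_free(struct bpf_prog *fp); + +static inline void bpf_jit_dump(unsigned int flen, unsigned int proglen, + u32 pass, void *image) +{ + pr_err("flen=%u proglen=%u pass=%u image=%pK\n", + flen, proglen, pass, image); + if (image) + print_hex_dump(KERN_ERR, "JIT code: ", DUMP_PREFIX_OFFSET, + 16, 1,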
image, proglen, false); +} +#else +static inline void bpf_jit_compile(struct bpf_prog *fp) +{ +} + +static inline void bpf_jit_free(struct bpf_prog *fp) +{ + bpf_prog_unlock_free(fp); +} +#endif /* CONFIG_BPF_JIT */ + #define BPF_ANC BIT(15) static inline u16 bpf_anc_helper(const struct sock_filter *ftest) @@ -440,36 +470,6 @@ static inline void *bpf_load_pointer(const struct sk_buff *skb, int k, return bpf_internal_load_pointer_neg_helper(skb, k, size); } -#ifdef CONFIG_BPF_JIT -#include -#include -#include - -void bpf_jit_compile(struct bpf_prog *fp); -void bpf_jit_free(struct bpf_prog *fp); - -static inline void bpf_jit_dump(unsigned int flen, unsigned int proglen, - u32 pass, void *image) -{ - pr_err("flen=%u proglen=%u pass=%u image=%pK\n", - flen, proglen, pass, image); - if (image) - print_hex_dump(KERN_ERR, "JIT code: ", DUMP_PREFIX_OFFSET, - 16, 1, image, proglen, false); -} -#else -#include - -static inline void bpf_jit_compile(struct bpf_prog *fp) -{ -} - -static inline void bpf_jit_free(struct bpf_prog *fp) -{ - bpf_prog_unlock_free(fp); -} -#endif /* CONFIG_BPF_JIT */ - static inline int bpf_tell_extensions(void) { return SKF_AD_MAX; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 8ee520f0ec70..8b7002488251 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -128,6 +128,7 @@ void __bpf_prog_free(struct bpf_prog *fp) } EXPORT_SYMBOL_GPL(__bpf_prog_free); +#ifdef CONFIG_BPF_JIT struct bpf_binary_header * bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, unsigned int alignment, @@ -163,6 +164,7 @@ void bpf_jit_binary_free(struct bpf_binary_header *hdr) { module_free(NULL, hdr); } +#endif /* CONFIG_BPF_JIT */ /* Base function for offset calculation. Needs to go into .text section, * therefore keeping it non-static as well; will also be used by JITs -- cgit v1.2.3-59-g8ed1b From 960d01acf62747d6518694f92be5b06f67473833 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 9 Sep 2014 22:55:35 +0300 Subject: cfg80211: add WMM traffic stream API Add nl80211 and driver API to validate, add and delete traffic streams with appropriate settings. The API expects userspace to do the action frame handshake with the peer; the API itself only sets up the parameters in the driver. To avoid setting up a session only to tear it down again, the validate API is provided, but the real setup can still fail later, so userspace must be prepared for that.
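For illustration, a minimal sketch of how a driver might wire up the two new callbacks; the mydrv_* names and hardware helpers are hypothetical, only the op signatures come from the patch below:

static int mydrv_add_tx_ts(struct wiphy *wiphy, struct net_device *dev,
			   u8 tsid, const u8 *peer, u8 user_prio,
			   u16 admitted_time)
{
	if (!admitted_time)
		return 0; /* validate-only call: report that setup is possible */

	/* program the TX path for this TSID/peer (hypothetical helper) */
	return mydrv_hw_program_ts(dev, tsid, peer, user_prio, admitted_time);
}

static int mydrv_del_tx_ts(struct wiphy *wiphy, struct net_device *dev,
			   u8 tsid, const u8 *peer)
{
	return mydrv_hw_clear_ts(dev, tsid, peer); /* hypothetical helper */
}

static const struct cfg80211_ops mydrv_cfg_ops = {
	/* ... */
	.add_tx_ts = mydrv_add_tx_ts,
	.del_tx_ts = mydrv_del_tx_ts,
};

A driver exposing this would also set WIPHY_FLAG_SUPPORTS_WMM_ADMISSION in wiphy->flags, since nl80211 rejects NL80211_CMD_ADD_TX_TS with -EOPNOTSUPP otherwise.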
Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 4 ++ include/net/cfg80211.h | 23 ++++++++- include/uapi/linux/nl80211.h | 28 +++++++++++ net/wireless/nl80211.c | 109 +++++++++++++++++++++++++++++++++++++++++++ net/wireless/rdev-ops.h | 31 ++++++++++++ net/wireless/trace.h | 45 ++++++++++++++++++ 6 files changed, 239 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 61a09f0644aa..b1be39c76931 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -166,8 +166,12 @@ static inline u16 ieee80211_sn_sub(u16 sn1, u16 sn2) #define IEEE80211_MAX_MESH_ID_LEN 32 +#define IEEE80211_FIRST_TSPEC_TSID 8 #define IEEE80211_NUM_TIDS 16 +/* number of user priorities 802.11 uses */ +#define IEEE80211_NUM_UPS 8 + #define IEEE80211_QOS_CTL_LEN 2 /* 1d tag mask */ #define IEEE80211_QOS_CTL_TAG1D_MASK 0x0007 diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index c2c710c14a50..a13beab123dc 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2318,6 +2318,17 @@ struct cfg80211_qos_map { * @set_ap_chanwidth: Set the AP (including P2P GO) mode channel width for the * given interface This is used e.g. for dynamic HT 20/40 MHz channel width * changes during the lifetime of the BSS. + * + * @add_tx_ts: validate (if admitted_time is 0) or add a TX TS to the device + * with the given parameters; action frame exchange has been handled by + * userspace so this just has to modify the TX path to take the TS into + * account. + * If the admitted time is 0 just validate the parameters to make sure + * the session can be created at all; it is valid to just always return + * success for that but that may result in inefficient behaviour (handshake + * with the peer followed by immediate teardown when the addition is later + * rejected) + * @del_tx_ts: remove an existing TX TS */ struct cfg80211_ops { int (*suspend)(struct wiphy *wiphy, struct cfg80211_wowlan *wow); @@ -2558,6 +2569,12 @@ struct cfg80211_ops { int (*set_ap_chanwidth)(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_chan_def *chandef); + + int (*add_tx_ts)(struct wiphy *wiphy, struct net_device *dev, + u8 tsid, const u8 *peer, u8 user_prio, + u16 admitted_time); + int (*del_tx_ts)(struct wiphy *wiphy, struct net_device *dev, + u8 tsid, const u8 *peer); }; /* @@ -2604,9 +2621,13 @@ struct cfg80211_ops { * @WIPHY_FLAG_SUPPORTS_5_10_MHZ: Device supports 5 MHz and 10 MHz channels. * @WIPHY_FLAG_HAS_CHANNEL_SWITCH: Device supports channel switch in * beaconing mode (AP, IBSS, Mesh, ...). + * @WIPHY_FLAG_SUPPORTS_WMM_ADMISSION: the device supports setting up WMM + * TSPEC sessions (TID aka TSID 0-7) with the NL80211_CMD_ADD_TX_TS + * command. Standard IEEE 802.11 TSPEC setup is not yet supported, it + * needs to be able to handle Block-Ack agreements and other things. */ enum wiphy_flags { - /* use hole at 0 */ + WIPHY_FLAG_SUPPORTS_WMM_ADMISSION = BIT(0), /* use hole at 1 */ /* use hole at 2 */ WIPHY_FLAG_NETNS_OK = BIT(3), diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 29c4399e08b9..e5b8caf5e466 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -722,6 +722,22 @@ * QoS mapping is relevant for IP packets, it is only valid during an * association. This is cleared on disassociation and AP restart. 
* + * @NL80211_CMD_ADD_TX_TS: Ask the kernel to add a traffic stream for the given + * %NL80211_ATTR_TSID and %NL80211_ATTR_MAC with %NL80211_ATTR_USER_PRIO + * and %NL80211_ATTR_ADMITTED_TIME parameters. + * Note that the action frame handshake with the AP shall be handled by + * userspace via the normal management RX/TX framework, this only sets + * up the TX TS in the driver/device. + * If the admitted time attribute is not added then the request just checks + * if a subsequent setup could be successful, the intent is to use this to + * avoid setting up a session with the AP when local restrictions would + * make that impossible. However, the subsequent "real" setup may still + * fail even if the check was successful. + * @NL80211_CMD_DEL_TX_TS: Remove an existing TS with the %NL80211_ATTR_TSID + * and %NL80211_ATTR_MAC parameters. It isn't necessary to call this + * before removing a station entry entirely, or before disassociating + * or similar, cleanup will happen in the driver/device in this case. + * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use */ @@ -893,6 +909,9 @@ enum nl80211_commands { NL80211_CMD_SET_QOS_MAP, + NL80211_CMD_ADD_TX_TS, + NL80211_CMD_DEL_TX_TS, + /* add new commands above here */ /* used to define NL80211_CMD_MAX below */ @@ -1611,6 +1630,11 @@ enum nl80211_commands { * drivers to indicate dynack capability. Dynack is automatically disabled * setting valid value for coverage class. * + * @NL80211_ATTR_TSID: a TSID value (u8 attribute) + * @NL80211_ATTR_USER_PRIO: user priority value (u8 attribute) + * @NL80211_ATTR_ADMITTED_TIME: admitted time in units of 32 microseconds + * (per second) (u16 attribute) + * * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use */ @@ -1957,6 +1981,10 @@ enum nl80211_attrs { NL80211_ATTR_WIPHY_DYN_ACK, + NL80211_ATTR_TSID, + NL80211_ATTR_USER_PRIO, + NL80211_ATTR_ADMITTED_TIME, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index e9fbd4f4ddb0..ab7ee4893e40 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -391,6 +391,9 @@ static const struct nla_policy nl80211_policy[NL80211_ATTR_MAX+1] = { [NL80211_ATTR_IFACE_SOCKET_OWNER] = { .type = NLA_FLAG }, [NL80211_ATTR_CSA_C_OFFSETS_TX] = { .type = NLA_BINARY }, [NL80211_ATTR_USE_RRM] = { .type = NLA_FLAG }, + [NL80211_ATTR_TSID] = { .type = NLA_U8 }, + [NL80211_ATTR_USER_PRIO] = { .type = NLA_U8 }, + [NL80211_ATTR_ADMITTED_TIME] = { .type = NLA_U16 }, }; /* policy for the key attributes */ @@ -1510,6 +1513,9 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, if (rdev->wiphy.flags & WIPHY_FLAG_HAS_CHANNEL_SWITCH) CMD(channel_switch, CHANNEL_SWITCH); CMD(set_qos_map, SET_QOS_MAP); + if (rdev->wiphy.flags & + WIPHY_FLAG_SUPPORTS_WMM_ADMISSION) + CMD(add_tx_ts, ADD_TX_TS); } /* add into the if now */ #undef CMD @@ -9390,6 +9396,93 @@ static int nl80211_set_qos_map(struct sk_buff *skb, return ret; } +static int nl80211_add_tx_ts(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct wireless_dev *wdev = dev->ieee80211_ptr; + const u8 *peer; + u8 tsid, up; + u16 admitted_time = 0; + int err; + + if (!(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_WMM_ADMISSION)) + return -EOPNOTSUPP; + + if (!info->attrs[NL80211_ATTR_TSID] || 
!info->attrs[NL80211_ATTR_MAC] || + !info->attrs[NL80211_ATTR_USER_PRIO]) + return -EINVAL; + + tsid = nla_get_u8(info->attrs[NL80211_ATTR_TSID]); + if (tsid >= IEEE80211_NUM_TIDS) + return -EINVAL; + + up = nla_get_u8(info->attrs[NL80211_ATTR_USER_PRIO]); + if (up >= IEEE80211_NUM_UPS) + return -EINVAL; + + /* WMM uses TIDs 0-7 even for TSPEC */ + if (tsid < IEEE80211_FIRST_TSPEC_TSID) { + if (!(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_WMM_ADMISSION)) + return -EINVAL; + } else { + /* TODO: handle 802.11 TSPEC/admission control + * need more attributes for that (e.g. BA session requirement) + */ + return -EINVAL; + } + + peer = nla_data(info->attrs[NL80211_ATTR_MAC]); + + if (info->attrs[NL80211_ATTR_ADMITTED_TIME]) { + admitted_time = + nla_get_u16(info->attrs[NL80211_ATTR_ADMITTED_TIME]); + if (!admitted_time) + return -EINVAL; + } + + wdev_lock(wdev); + switch (wdev->iftype) { + case NL80211_IFTYPE_STATION: + case NL80211_IFTYPE_P2P_CLIENT: + if (wdev->current_bss) + break; + err = -ENOTCONN; + goto out; + default: + err = -EOPNOTSUPP; + goto out; + } + + err = rdev_add_tx_ts(rdev, dev, tsid, peer, up, admitted_time); + + out: + wdev_unlock(wdev); + return err; +} + +static int nl80211_del_tx_ts(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct wireless_dev *wdev = dev->ieee80211_ptr; + const u8 *peer; + u8 tsid; + int err; + + if (!info->attrs[NL80211_ATTR_TSID] || !info->attrs[NL80211_ATTR_MAC]) + return -EINVAL; + + tsid = nla_get_u8(info->attrs[NL80211_ATTR_TSID]); + peer = nla_data(info->attrs[NL80211_ATTR_MAC]); + + wdev_lock(wdev); + err = rdev_del_tx_ts(rdev, dev, tsid, peer); + wdev_unlock(wdev); + + return err; +} + #define NL80211_FLAG_NEED_WIPHY 0x01 #define NL80211_FLAG_NEED_NETDEV 0x02 #define NL80211_FLAG_NEED_RTNL 0x04 @@ -10147,6 +10240,22 @@ static const struct genl_ops nl80211_ops[] = { .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, + { + .cmd = NL80211_CMD_ADD_TX_TS, + .doit = nl80211_add_tx_ts, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_DEL_TX_TS, + .doit = nl80211_del_tx_ts, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, }; /* notification functions */ diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index 56c2240c30ce..f6d457d6a558 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -915,4 +915,35 @@ rdev_set_ap_chanwidth(struct cfg80211_registered_device *rdev, return ret; } +static inline int +rdev_add_tx_ts(struct cfg80211_registered_device *rdev, + struct net_device *dev, u8 tsid, const u8 *peer, + u8 user_prio, u16 admitted_time) +{ + int ret = -EOPNOTSUPP; + + trace_rdev_add_tx_ts(&rdev->wiphy, dev, tsid, peer, + user_prio, admitted_time); + if (rdev->ops->add_tx_ts) + ret = rdev->ops->add_tx_ts(&rdev->wiphy, dev, tsid, peer, + user_prio, admitted_time); + trace_rdev_return_int(&rdev->wiphy, ret); + + return ret; +} + +static inline int +rdev_del_tx_ts(struct cfg80211_registered_device *rdev, + struct net_device *dev, u8 tsid, const u8 *peer) +{ + int ret = -EOPNOTSUPP; + + trace_rdev_del_tx_ts(&rdev->wiphy, dev, tsid, peer); + if (rdev->ops->del_tx_ts) + ret = rdev->ops->del_tx_ts(&rdev->wiphy, dev, tsid, peer); + trace_rdev_return_int(&rdev->wiphy, ret); + + return ret; +} + 
#endif /* __CFG80211_RDEV_OPS */ diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 0c524cd76c83..625a6e6d1168 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -1896,6 +1896,51 @@ TRACE_EVENT(rdev_set_ap_chanwidth, WIPHY_PR_ARG, NETDEV_PR_ARG, CHAN_DEF_PR_ARG) ); +TRACE_EVENT(rdev_add_tx_ts, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + u8 tsid, const u8 *peer, u8 user_prio, u16 admitted_time), + TP_ARGS(wiphy, netdev, tsid, peer, user_prio, admitted_time), + TP_STRUCT__entry( + WIPHY_ENTRY + NETDEV_ENTRY + MAC_ENTRY(peer) + __field(u8, tsid) + __field(u8, user_prio) + __field(u16, admitted_time) + ), + TP_fast_assign( + WIPHY_ASSIGN; + NETDEV_ASSIGN; + MAC_ASSIGN(peer, peer); + __entry->tsid = tsid; + __entry->user_prio = user_prio; + __entry->admitted_time = admitted_time; + ), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " MAC_PR_FMT ", TSID %d, UP %d, time %d", + WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(peer), + __entry->tsid, __entry->user_prio, __entry->admitted_time) +); + +TRACE_EVENT(rdev_del_tx_ts, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + u8 tsid, const u8 *peer), + TP_ARGS(wiphy, netdev, tsid, peer), + TP_STRUCT__entry( + WIPHY_ENTRY + NETDEV_ENTRY + MAC_ENTRY(peer) + __field(u8, tsid) + ), + TP_fast_assign( + WIPHY_ASSIGN; + NETDEV_ASSIGN; + MAC_ASSIGN(peer, peer); + __entry->tsid = tsid; + ), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " MAC_PR_FMT ", TSID %d", + WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(peer), __entry->tsid) +); + /************************************************************* * cfg80211 exported functions traces * *************************************************************/ -- cgit v1.2.3-59-g8ed1b From 46e5da40aec256155cfedee96dd21a75da941f2c Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 12 Sep 2014 20:04:52 -0700 Subject: net: qdisc: use rcu prefix and silence sparse warnings Add __rcu notation to qdisc handling; by doing this we can make smatch output more legible. And anyway, some of the cases should be using rcu_dereference(); see qdisc_all_tx_empty(), qdisc_tx_changing(), and so on. Also, the *wake_queue() API is commonly called from driver timer routines without the rcu lock or rtnl lock, so I added rcu_read_lock() blocks around netif_wake_subqueue() and netif_tx_wake_queue().
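A condensed sketch of the reader/writer discipline this conversion enforces (illustrative only; both helper functions are made up):

static void example_reader(struct netdev_queue *txq)
{
	struct Qdisc *q;

	rcu_read_lock();
	q = rcu_dereference(txq->qdisc);	/* fast path under RCU */
	if (q)
		__netif_schedule(q);
	rcu_read_unlock();
}

static void example_writer(struct netdev_queue *txq, struct Qdisc *new)
{
	ASSERT_RTNL();	/* slow path: updates are serialized by RTNL */
	rcu_assign_pointer(txq->qdisc, new);
}

Signed-off-by: John Fastabend Acked-by: Eric Dumazet Signed-off-by: David S.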
Miller --- include/linux/netdevice.h | 29 ++++----------------------- include/net/sch_generic.h | 21 +++++++++++++------ net/core/dev.c | 51 +++++++++++++++++++++++++++++++++++++++++++++-- net/sched/sch_generic.c | 4 ++-- net/sched/sch_mqprio.c | 6 ++++-- net/sched/sch_teql.c | 13 +++++++----- 6 files changed, 82 insertions(+), 42 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ba72f6baae1a..ae721f53739e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -543,7 +543,7 @@ struct netdev_queue { * read mostly part */ struct net_device *dev; - struct Qdisc *qdisc; + struct Qdisc __rcu *qdisc; struct Qdisc *qdisc_sleeping; #ifdef CONFIG_SYSFS struct kobject kobj; @@ -2356,12 +2356,7 @@ static inline void input_queue_tail_incr_save(struct softnet_data *sd, DECLARE_PER_CPU_ALIGNED(struct softnet_data, softnet_data); void __netif_schedule(struct Qdisc *q); - -static inline void netif_schedule_queue(struct netdev_queue *txq) -{ - if (!(txq->state & QUEUE_STATE_ANY_XOFF)) - __netif_schedule(txq->qdisc); -} +void netif_schedule_queue(struct netdev_queue *txq); static inline void netif_tx_schedule_all(struct net_device *dev) { @@ -2397,11 +2392,7 @@ static inline void netif_tx_start_all_queues(struct net_device *dev) } } -static inline void netif_tx_wake_queue(struct netdev_queue *dev_queue) -{ - if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) - __netif_schedule(dev_queue->qdisc); -} +void netif_tx_wake_queue(struct netdev_queue *dev_queue); /** * netif_wake_queue - restart transmit @@ -2673,19 +2664,7 @@ static inline bool netif_subqueue_stopped(const struct net_device *dev, return __netif_subqueue_stopped(dev, skb_get_queue_mapping(skb)); } -/** - * netif_wake_subqueue - allow sending packets on subqueue - * @dev: network device - * @queue_index: sub queue index - * - * Resume individual transmit queue of a device with multiple transmit queues. 
- */ -static inline void netif_wake_subqueue(struct net_device *dev, u16 queue_index) -{ - struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index); - if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) - __netif_schedule(txq->qdisc); -} +void netif_wake_subqueue(struct net_device *dev, u16 queue_index); #ifdef CONFIG_XPS int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index a3cfb8ebeb53..56838ab29b42 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -259,7 +259,9 @@ static inline spinlock_t *qdisc_lock(struct Qdisc *qdisc) static inline struct Qdisc *qdisc_root(const struct Qdisc *qdisc) { - return qdisc->dev_queue->qdisc; + struct Qdisc *q = rcu_dereference_rtnl(qdisc->dev_queue->qdisc); + + return q; } static inline struct Qdisc *qdisc_root_sleeping(const struct Qdisc *qdisc) @@ -384,7 +386,7 @@ static inline void qdisc_reset_all_tx_gt(struct net_device *dev, unsigned int i) struct Qdisc *qdisc; for (; i < dev->num_tx_queues; i++) { - qdisc = netdev_get_tx_queue(dev, i)->qdisc; + qdisc = rtnl_dereference(netdev_get_tx_queue(dev, i)->qdisc); if (qdisc) { spin_lock_bh(qdisc_lock(qdisc)); qdisc_reset(qdisc); @@ -402,13 +404,18 @@ static inline void qdisc_reset_all_tx(struct net_device *dev) static inline bool qdisc_all_tx_empty(const struct net_device *dev) { unsigned int i; + + rcu_read_lock(); for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq = netdev_get_tx_queue(dev, i); - const struct Qdisc *q = txq->qdisc; + const struct Qdisc *q = rcu_dereference(txq->qdisc); - if (q->q.qlen) + if (q->q.qlen) { + rcu_read_unlock(); return false; + } } + rcu_read_unlock(); return true; } @@ -416,9 +423,10 @@ static inline bool qdisc_all_tx_empty(const struct net_device *dev) static inline bool qdisc_tx_changing(const struct net_device *dev) { unsigned int i; + for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq = netdev_get_tx_queue(dev, i); - if (txq->qdisc != txq->qdisc_sleeping) + if (rcu_access_pointer(txq->qdisc) != txq->qdisc_sleeping) return true; } return false; @@ -428,9 +436,10 @@ static inline bool qdisc_tx_changing(const struct net_device *dev) static inline bool qdisc_tx_is_noop(const struct net_device *dev) { unsigned int i; + for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq = netdev_get_tx_queue(dev, i); - if (txq->qdisc != &noop_qdisc) + if (rcu_access_pointer(txq->qdisc) != &noop_qdisc) return false; } return true; diff --git a/net/core/dev.c b/net/core/dev.c index 3c6a967e5830..b3d6dbc0c696 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2177,6 +2177,53 @@ static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb) return (struct dev_kfree_skb_cb *)skb->cb; } +void netif_schedule_queue(struct netdev_queue *txq) +{ + rcu_read_lock(); + if (!(txq->state & QUEUE_STATE_ANY_XOFF)) { + struct Qdisc *q = rcu_dereference(txq->qdisc); + + __netif_schedule(q); + } + rcu_read_unlock(); +} +EXPORT_SYMBOL(netif_schedule_queue); + +/** + * netif_wake_subqueue - allow sending packets on subqueue + * @dev: network device + * @queue_index: sub queue index + * + * Resume individual transmit queue of a device with multiple transmit queues. 
+ */ +void netif_wake_subqueue(struct net_device *dev, u16 queue_index) +{ + struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index); + + if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) { + struct Qdisc *q; + + rcu_read_lock(); + q = rcu_dereference(txq->qdisc); + __netif_schedule(q); + rcu_read_unlock(); + } +} +EXPORT_SYMBOL(netif_wake_subqueue); + +void netif_tx_wake_queue(struct netdev_queue *dev_queue) +{ + if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) { + struct Qdisc *q; + + rcu_read_lock(); + q = rcu_dereference(dev_queue->qdisc); + __netif_schedule(q); + rcu_read_unlock(); + } +} +EXPORT_SYMBOL(netif_tx_wake_queue); + void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason) { unsigned long flags; @@ -3432,7 +3479,7 @@ static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq) skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl); skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS); - q = rxq->qdisc; + q = rcu_dereference(rxq->qdisc); if (q != &noop_qdisc) { spin_lock(qdisc_lock(q)); if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) @@ -3449,7 +3496,7 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb, { struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue); - if (!rxq || rxq->qdisc == &noop_qdisc) + if (!rxq || rcu_access_pointer(rxq->qdisc) == &noop_qdisc) goto out; if (*pt_prev) { diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 19696ebe9ebc..346ef85617d3 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -783,7 +783,7 @@ static void dev_deactivate_queue(struct net_device *dev, struct Qdisc *qdisc_default = _qdisc_default; struct Qdisc *qdisc; - qdisc = dev_queue->qdisc; + qdisc = rtnl_dereference(dev_queue->qdisc); if (qdisc) { spin_lock_bh(qdisc_lock(qdisc)); @@ -876,7 +876,7 @@ static void dev_init_scheduler_queue(struct net_device *dev, { struct Qdisc *qdisc = _qdisc; - dev_queue->qdisc = qdisc; + rcu_assign_pointer(dev_queue->qdisc, qdisc); dev_queue->qdisc_sleeping = qdisc; } diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index 6749e2f540d0..37e7d25d21f1 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -231,7 +231,7 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb) memset(&sch->qstats, 0, sizeof(sch->qstats)); for (i = 0; i < dev->num_tx_queues; i++) { - qdisc = netdev_get_tx_queue(dev, i)->qdisc; + qdisc = rtnl_dereference(netdev_get_tx_queue(dev, i)->qdisc); spin_lock_bh(qdisc_lock(qdisc)); sch->q.qlen += qdisc->q.qlen; sch->bstats.bytes += qdisc->bstats.bytes; @@ -340,7 +340,9 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, spin_unlock_bh(d->lock); for (i = tc.offset; i < tc.offset + tc.count; i++) { - qdisc = netdev_get_tx_queue(dev, i)->qdisc; + struct netdev_queue *q = netdev_get_tx_queue(dev, i); + + qdisc = rtnl_dereference(q->qdisc); spin_lock_bh(qdisc_lock(qdisc)); bstats.bytes += qdisc->bstats.bytes; bstats.packets += qdisc->bstats.packets; diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c index aaa8d03ed054..5cd291bd00e4 100644 --- a/net/sched/sch_teql.c +++ b/net/sched/sch_teql.c @@ -96,11 +96,14 @@ teql_dequeue(struct Qdisc *sch) struct teql_sched_data *dat = qdisc_priv(sch); struct netdev_queue *dat_queue; struct sk_buff *skb; + struct Qdisc *q; skb = __skb_dequeue(&dat->q); dat_queue = netdev_get_tx_queue(dat->m->dev, 0); + q = rcu_dereference_bh(dat_queue->qdisc); + if (skb == NULL) { - struct net_device *m = qdisc_dev(dat_queue->qdisc); 
+ struct net_device *m = qdisc_dev(q); if (m) { dat->m->slaves = sch; netif_wake_queue(m); } @@ -108,7 +111,7 @@ teql_dequeue(struct Qdisc *sch) } else { qdisc_bstats_update(sch, skb); } - sch->q.qlen = dat->q.qlen + dat_queue->qdisc->q.qlen; + sch->q.qlen = dat->q.qlen + q->q.qlen; return skb; } @@ -157,9 +160,9 @@ teql_destroy(struct Qdisc *sch) txq = netdev_get_tx_queue(master->dev, 0); master->slaves = NULL; - root_lock = qdisc_root_sleeping_lock(txq->qdisc); + root_lock = qdisc_root_sleeping_lock(rtnl_dereference(txq->qdisc)); spin_lock_bh(root_lock); - qdisc_reset(txq->qdisc); + qdisc_reset(rtnl_dereference(txq->qdisc)); spin_unlock_bh(root_lock); } } @@ -266,7 +269,7 @@ static inline int teql_resolve(struct sk_buff *skb, struct dst_entry *dst = skb_dst(skb); int res; - if (txq->qdisc == &noop_qdisc) + if (rcu_access_pointer(txq->qdisc) == &noop_qdisc) return -ENODEV; if (!dev->header_ops || !dst) -- cgit v1.2.3-59-g8ed1b From 331b72922c5f58d48fd5500acadc91777cc31970 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 12 Sep 2014 20:08:20 -0700 Subject: net: sched: RCU cls_tcindex Make cls_tcindex RCU safe. This patch adds a new RCU helper, rcu_dereference_bh_rtnl(), to check that the caller holds either the rcu read lock or RTNL. This is needed because tcindex_lookup() is called in both contexts.
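To make the dual-context requirement concrete, here is a toy lookup that is legal both from the softirq classify path (under rcu_read_lock_bh()) and from the netlink control path (under RTNL); the struct is invented for the sketch:

struct example_node {
	struct example_node __rcu *next;
	u16 key;
};

static struct example_node *example_lookup(struct example_node __rcu **head,
					   u16 key)
{
	struct example_node *n;

	/* lockdep verifies we hold rcu_read_lock_bh() or RTNL */
	for (n = rcu_dereference_bh_rtnl(*head); n;
	     n = rcu_dereference_bh_rtnl(n->next))
		if (n->key == key)
			return n;
	return NULL;
}

Signed-off-by: John Fastabend Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/rtnetlink.h | 10 ++ net/sched/cls_tcindex.c | 248 ++++++++++++++++++++++++++++------------ 2 files changed, 164 insertions(+), 94 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 167bae7bdfa4..6cacbce1a06c 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -46,6 +46,16 @@ static inline int lockdep_rtnl_is_held(void) #define rcu_dereference_rtnl(p) \ rcu_dereference_check(p, lockdep_rtnl_is_held()) +/** + * rcu_dereference_bh_rtnl - rcu_dereference_bh with debug checking + * @p: The pointer to read, prior to dereference + * + * Do an rcu_dereference_bh(p), but check caller either holds rcu_read_lock_bh() + * or RTNL.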
Note : Please prefer rtnl_dereference() or rcu_dereference_bh() + */ +#define rcu_dereference_bh_rtnl(p) \ + rcu_dereference_bh_check(p, lockdep_rtnl_is_held()) + /** * rtnl_dereference - fetch RCU pointer when updates are prevented by RTNL * @p: The pointer to read, prior to dereferencing diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index 3e9f76413b3b..a9f4279fbd69 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -32,19 +32,21 @@ struct tcindex_filter_result { struct tcindex_filter { u16 key; struct tcindex_filter_result result; - struct tcindex_filter *next; + struct tcindex_filter __rcu *next; + struct rcu_head rcu; }; struct tcindex_data { struct tcindex_filter_result *perfect; /* perfect hash; NULL if none */ - struct tcindex_filter **h; /* imperfect hash; only used if !perfect; - NULL if unused */ + struct tcindex_filter __rcu **h; /* imperfect hash; */ + struct tcf_proto *tp; u16 mask; /* AND key with mask */ - int shift; /* shift ANDed key to the right */ - int hash; /* hash table size; 0 if undefined */ - int alloc_hash; /* allocated size */ - int fall_through; /* 0: only classify if explicit match */ + u32 shift; /* shift ANDed key to the right */ + u32 hash; /* hash table size; 0 if undefined */ + u32 alloc_hash; /* allocated size */ + u32 fall_through; /* 0: only classify if explicit match */ + struct rcu_head rcu; }; static inline int @@ -56,13 +58,18 @@ tcindex_filter_is_set(struct tcindex_filter_result *r) static struct tcindex_filter_result * tcindex_lookup(struct tcindex_data *p, u16 key) { - struct tcindex_filter *f; + if (p->perfect) { + struct tcindex_filter_result *f = p->perfect + key; + + return tcindex_filter_is_set(f) ? f : NULL; + } else if (p->h) { + struct tcindex_filter __rcu **fp; + struct tcindex_filter *f; - if (p->perfect) - return tcindex_filter_is_set(p->perfect + key) ? 
- p->perfect + key : NULL; - else if (p->h) { - for (f = p->h[key % p->hash]; f; f = f->next) + fp = &p->h[key % p->hash]; + for (f = rcu_dereference_bh_rtnl(*fp); + f; + fp = &f->next, f = rcu_dereference_bh_rtnl(*fp)) if (f->key == key) return &f->result; } @@ -74,7 +81,7 @@ tcindex_lookup(struct tcindex_data *p, u16 key) static int tcindex_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { - struct tcindex_data *p = tp->root; + struct tcindex_data *p = rcu_dereference(tp->root); struct tcindex_filter_result *f; int key = (skb->tc_index & p->mask) >> p->shift; @@ -99,7 +106,7 @@ static int tcindex_classify(struct sk_buff *skb, const struct tcf_proto *tp, static unsigned long tcindex_get(struct tcf_proto *tp, u32 handle) { - struct tcindex_data *p = tp->root; + struct tcindex_data *p = rtnl_dereference(tp->root); struct tcindex_filter_result *r; pr_debug("tcindex_get(tp %p,handle 0x%08x)\n", tp, handle); @@ -129,49 +136,59 @@ static int tcindex_init(struct tcf_proto *tp) p->hash = DEFAULT_HASH_SIZE; p->fall_through = 1; - tp->root = p; + rcu_assign_pointer(tp->root, p); return 0; } - static int -__tcindex_delete(struct tcf_proto *tp, unsigned long arg, int lock) +tcindex_delete(struct tcf_proto *tp, unsigned long arg) { - struct tcindex_data *p = tp->root; + struct tcindex_data *p = rtnl_dereference(tp->root); struct tcindex_filter_result *r = (struct tcindex_filter_result *) arg; + struct tcindex_filter __rcu **walk; struct tcindex_filter *f = NULL; - pr_debug("tcindex_delete(tp %p,arg 0x%lx),p %p,f %p\n", tp, arg, p, f); + pr_debug("tcindex_delete(tp %p,arg 0x%lx),p %p\n", tp, arg, p); if (p->perfect) { if (!r->res.class) return -ENOENT; } else { int i; - struct tcindex_filter **walk = NULL; - for (i = 0; i < p->hash; i++) - for (walk = p->h+i; *walk; walk = &(*walk)->next) - if (&(*walk)->result == r) + for (i = 0; i < p->hash; i++) { + walk = p->h + i; + for (f = rtnl_dereference(*walk); f; + walk = &f->next, f = rtnl_dereference(*walk)) { + if (&f->result == r) goto found; + } + } return -ENOENT; found: - f = *walk; - if (lock) - tcf_tree_lock(tp); - *walk = f->next; - if (lock) - tcf_tree_unlock(tp); + rcu_assign_pointer(*walk, rtnl_dereference(f->next)); } tcf_unbind_filter(tp, &r->res); tcf_exts_destroy(tp, &r->exts); - kfree(f); + if (f) + kfree_rcu(f, rcu); return 0; } -static int tcindex_delete(struct tcf_proto *tp, unsigned long arg) +static int tcindex_destroy_element(struct tcf_proto *tp, + unsigned long arg, + struct tcf_walker *walker) { - return __tcindex_delete(tp, arg, 1); + return tcindex_delete(tp, arg); +} + +static void __tcindex_destroy(struct rcu_head *head) +{ + struct tcindex_data *p = container_of(head, struct tcindex_data, rcu); + + kfree(p->perfect); + kfree(p->h); + kfree(p); } static inline int @@ -194,6 +211,14 @@ static void tcindex_filter_result_init(struct tcindex_filter_result *r) tcf_exts_init(&r->exts, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); } +static void __tcindex_partial_destroy(struct rcu_head *head) +{ + struct tcindex_data *p = container_of(head, struct tcindex_data, rcu); + + kfree(p->perfect); + kfree(p); +} + static int tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, u32 handle, struct tcindex_data *p, @@ -203,7 +228,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, int err, balloc = 0; struct tcindex_filter_result new_filter_result, *old_r = r; struct tcindex_filter_result cr; - struct tcindex_data cp; + struct tcindex_data *cp, *oldp; struct 
tcindex_filter *f = NULL; /* make gcc behave */ struct tcf_exts e; @@ -212,84 +237,118 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, if (err < 0) return err; - memcpy(&cp, p, sizeof(cp)); - tcindex_filter_result_init(&new_filter_result); + /* tcindex_data attributes must look atomic to classifier/lookup so + * allocate new tcindex data and RCU assign it onto root. Keeping + * perfect hash and hash pointers from old data. + */ + cp = kzalloc(sizeof(cp), GFP_KERNEL); + if (!cp) + return -ENOMEM; + + cp->mask = p->mask; + cp->shift = p->shift; + cp->hash = p->hash; + cp->alloc_hash = p->alloc_hash; + cp->fall_through = p->fall_through; + cp->tp = tp; + + if (p->perfect) { + cp->perfect = kmemdup(p->perfect, + sizeof(*r) * cp->hash, GFP_KERNEL); + if (!cp->perfect) + goto errout; + } + cp->h = p->h; + + memset(&new_filter_result, 0, sizeof(new_filter_result)); + tcf_exts_init(&new_filter_result.exts, + TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); tcindex_filter_result_init(&cr); if (old_r) cr.res = r->res; if (tb[TCA_TCINDEX_HASH]) - cp.hash = nla_get_u32(tb[TCA_TCINDEX_HASH]); + cp->hash = nla_get_u32(tb[TCA_TCINDEX_HASH]); if (tb[TCA_TCINDEX_MASK]) - cp.mask = nla_get_u16(tb[TCA_TCINDEX_MASK]); + cp->mask = nla_get_u16(tb[TCA_TCINDEX_MASK]); if (tb[TCA_TCINDEX_SHIFT]) - cp.shift = nla_get_u32(tb[TCA_TCINDEX_SHIFT]); + cp->shift = nla_get_u32(tb[TCA_TCINDEX_SHIFT]); err = -EBUSY; + /* Hash already allocated, make sure that we still meet the * requirements for the allocated hash. */ - if (cp.perfect) { - if (!valid_perfect_hash(&cp) || - cp.hash > cp.alloc_hash) + if (cp->perfect) { + if (!valid_perfect_hash(cp) || + cp->hash > cp->alloc_hash) goto errout; - } else if (cp.h && cp.hash != cp.alloc_hash) + } else if (cp->h && cp->hash != cp->alloc_hash) { goto errout; + } err = -EINVAL; if (tb[TCA_TCINDEX_FALL_THROUGH]) - cp.fall_through = nla_get_u32(tb[TCA_TCINDEX_FALL_THROUGH]); + cp->fall_through = nla_get_u32(tb[TCA_TCINDEX_FALL_THROUGH]); - if (!cp.hash) { + if (!cp->hash) { /* Hash not specified, use perfect hash if the upper limit * of the hashing index is below the threshold. */ - if ((cp.mask >> cp.shift) < PERFECT_HASH_THRESHOLD) - cp.hash = (cp.mask >> cp.shift) + 1; + if ((cp->mask >> cp->shift) < PERFECT_HASH_THRESHOLD) + cp->hash = (cp->mask >> cp->shift) + 1; else - cp.hash = DEFAULT_HASH_SIZE; + cp->hash = DEFAULT_HASH_SIZE; } - if (!cp.perfect && !cp.h) - cp.alloc_hash = cp.hash; + if (!cp->perfect && cp->h) + cp->alloc_hash = cp->hash; /* Note: this could be as restrictive as if (handle & ~(mask >> shift)) * but then, we'd fail handles that may become valid after some future * mask change. While this is extremely unlikely to ever matter, * the check below is safer (and also more backwards-compatible). 
*/ - if (cp.perfect || valid_perfect_hash(&cp)) - if (handle >= cp.alloc_hash) + if (cp->perfect || valid_perfect_hash(cp)) + if (handle >= cp->alloc_hash) goto errout; err = -ENOMEM; - if (!cp.perfect && !cp.h) { - if (valid_perfect_hash(&cp)) { + if (!cp->perfect && !cp->h) { + if (valid_perfect_hash(cp)) { int i; - cp.perfect = kcalloc(cp.hash, sizeof(*r), GFP_KERNEL); - if (!cp.perfect) + cp->perfect = kcalloc(cp->hash, sizeof(*r), GFP_KERNEL); + if (!cp->perfect) goto errout; - for (i = 0; i < cp.hash; i++) - tcf_exts_init(&cp.perfect[i].exts, TCA_TCINDEX_ACT, + for (i = 0; i < cp->hash; i++) + tcf_exts_init(&cp->perfect[i].exts, + TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); balloc = 1; } else { - cp.h = kcalloc(cp.hash, sizeof(f), GFP_KERNEL); - if (!cp.h) + struct tcindex_filter __rcu **hash; + + hash = kcalloc(cp->hash, + sizeof(struct tcindex_filter *), + GFP_KERNEL); + + if (!hash) goto errout; + + cp->h = hash; balloc = 2; } } - if (cp.perfect) - r = cp.perfect + handle; + if (cp->perfect) + r = cp->perfect + handle; else - r = tcindex_lookup(&cp, handle) ? : &new_filter_result; + r = tcindex_lookup(cp, handle) ? : &new_filter_result; if (r == &new_filter_result) { f = kzalloc(sizeof(*f), GFP_KERNEL); @@ -307,33 +366,41 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, else tcf_exts_change(tp, &cr.exts, &e); - tcf_tree_lock(tp); if (old_r && old_r != r) tcindex_filter_result_init(old_r); - memcpy(p, &cp, sizeof(cp)); + oldp = p; r->res = cr.res; + rcu_assign_pointer(tp->root, cp); if (r == &new_filter_result) { - struct tcindex_filter **fp; + struct tcindex_filter *nfp; + struct tcindex_filter __rcu **fp; f->key = handle; f->result = new_filter_result; f->next = NULL; - for (fp = p->h+(handle % p->hash); *fp; fp = &(*fp)->next) - /* nothing */; - *fp = f; + + fp = p->h + (handle % p->hash); + for (nfp = rtnl_dereference(*fp); + nfp; + fp = &nfp->next, nfp = rtnl_dereference(*fp)) + ; /* nothing */ + + rcu_assign_pointer(*fp, f); } - tcf_tree_unlock(tp); + if (oldp) + call_rcu(&oldp->rcu, __tcindex_partial_destroy); return 0; errout_alloc: if (balloc == 1) - kfree(cp.perfect); + kfree(cp->perfect); else if (balloc == 2) - kfree(cp.h); + kfree(cp->h); errout: + kfree(cp); tcf_exts_destroy(tp, &e); return err; } @@ -345,7 +412,7 @@ tcindex_change(struct net *net, struct sk_buff *in_skb, { struct nlattr *opt = tca[TCA_OPTIONS]; struct nlattr *tb[TCA_TCINDEX_MAX + 1]; - struct tcindex_data *p = tp->root; + struct tcindex_data *p = rtnl_dereference(tp->root); struct tcindex_filter_result *r = (struct tcindex_filter_result *) *arg; int err; @@ -364,10 +431,9 @@ tcindex_change(struct net *net, struct sk_buff *in_skb, tca[TCA_RATE], ovr); } - static void tcindex_walk(struct tcf_proto *tp, struct tcf_walker *walker) { - struct tcindex_data *p = tp->root; + struct tcindex_data *p = rtnl_dereference(tp->root); struct tcindex_filter *f, *next; int i; @@ -390,8 +456,8 @@ static void tcindex_walk(struct tcf_proto *tp, struct tcf_walker *walker) if (!p->h) return; for (i = 0; i < p->hash; i++) { - for (f = p->h[i]; f; f = next) { - next = f->next; + for (f = rtnl_dereference(p->h[i]); f; f = next) { + next = rtnl_dereference(f->next); if (walker->count >= walker->skip) { if (walker->fn(tp, (unsigned long) &f->result, walker) < 0) { @@ -404,17 +470,9 @@ static void tcindex_walk(struct tcf_proto *tp, struct tcf_walker *walker) } } - -static int tcindex_destroy_element(struct tcf_proto *tp, - unsigned long arg, struct tcf_walker *walker) -{ - return __tcindex_delete(tp, 
arg, 0); -} - - static void tcindex_destroy(struct tcf_proto *tp) { - struct tcindex_data *p = tp->root; + struct tcindex_data *p = rtnl_dereference(tp->root); struct tcf_walker walker; pr_debug("tcindex_destroy(tp %p),p %p\n", tp, p); @@ -422,17 +480,16 @@ static void tcindex_destroy(struct tcf_proto *tp) walker.skip = 0; walker.fn = tcindex_destroy_element; tcindex_walk(tp, &walker); - kfree(p->perfect); - kfree(p->h); - kfree(p); - tp->root = NULL; + + RCU_INIT_POINTER(tp->root, NULL); + call_rcu(&p->rcu, __tcindex_destroy); } static int tcindex_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, struct sk_buff *skb, struct tcmsg *t) { - struct tcindex_data *p = tp->root; + struct tcindex_data *p = rtnl_dereference(tp->root); struct tcindex_filter_result *r = (struct tcindex_filter_result *) fh; unsigned char *b = skb_tail_pointer(skb); struct nlattr *nest; @@ -455,15 +512,18 @@ static int tcindex_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, nla_nest_end(skb, nest); } else { if (p->perfect) { - t->tcm_handle = r-p->perfect; + t->tcm_handle = r - p->perfect; } else { struct tcindex_filter *f; + struct tcindex_filter __rcu **fp; int i; t->tcm_handle = 0; for (i = 0; !t->tcm_handle && i < p->hash; i++) { - for (f = p->h[i]; !t->tcm_handle && f; - f = f->next) { + fp = &p->h[i]; + for (f = rtnl_dereference(*fp); + !t->tcm_handle && f; + fp = &f->next, f = rtnl_dereference(*fp)) { if (&f->result == r) t->tcm_handle = f->key; } -- cgit v1.2.3-59-g8ed1b From 6c555490e0ce885a9caf0a045db69382a3ccbc9c Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Thu, 11 Sep 2014 15:35:09 -0700 Subject: ipv6: drop useless rcu_read_lock() in anycast This code is now protected by the rtnl lock; the rcu read lock is useless now.
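The conversion pattern in a nutshell (an illustrative sketch, not taken from the patch): the RCU-protected lookup and its lock pair become RTNL-protected ones:

static void example_pick_device(struct net *net)
{
	struct net_device *dev;

	rtnl_lock();
	dev = __dev_get_by_flags(net, IFF_UP, IFF_UP | IFF_LOOPBACK);
	if (dev)
		pr_info("picked %s\n", dev->name); /* valid while RTNL is held */
	rtnl_unlock();
}

Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 ++-- net/core/dev.c | 14 ++++++++------ net/ipv6/anycast.c | 18 ++++++------------ 3 files changed, 16 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ae721f53739e..ee38b948d9a0 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2083,8 +2083,8 @@ void __dev_remove_pack(struct packet_type *pt); void dev_add_offload(struct packet_offload *po); void dev_remove_offload(struct packet_offload *po); -struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short flags, - unsigned short mask); +struct net_device *__dev_get_by_flags(struct net *net, unsigned short flags, + unsigned short mask); struct net_device *dev_get_by_name(struct net *net, const char *name); struct net_device *dev_get_by_name_rcu(struct net *net, const char *name); struct net_device *__dev_get_by_name(struct net *net, const char *name); diff --git a/net/core/dev.c b/net/core/dev.c index b3d6dbc0c696..e916ba8caccf 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -897,23 +897,25 @@ struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type) EXPORT_SYMBOL(dev_getfirstbyhwtype); /** - * dev_get_by_flags_rcu - find any device with given flags + * __dev_get_by_flags - find any device with given flags * @net: the applicable net namespace * @if_flags: IFF_* values * @mask: bitmask of bits in if_flags to check * * Search for any interface with the given flags. Returns NULL if a device * is not found or a pointer to the device. Must be called inside - * rcu_read_lock(), and result refcount is unchanged. + * rtnl_lock(), and result refcount is unchanged.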
*/ -struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags, - unsigned short mask) +struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags, + unsigned short mask) { struct net_device *dev, *ret; + ASSERT_RTNL(); + ret = NULL; - for_each_netdev_rcu(net, dev) { + for_each_netdev(net, dev) { if (((dev->flags ^ if_flags) & mask) == 0) { ret = dev; break; @@ -921,7 +923,7 @@ struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags } return ret; } -EXPORT_SYMBOL(dev_get_by_flags_rcu); +EXPORT_SYMBOL(__dev_get_by_flags); /** * dev_valid_name - check if name is okay for network device diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index ff2de7d9d8e6..3b0429bc6b5a 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -78,7 +78,6 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr) pac->acl_addr = *addr; rtnl_lock(); - rcu_read_lock(); if (ifindex == 0) { struct rt6_info *rt; @@ -91,11 +90,11 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr) goto error; } else { /* router, no matching interface: just pick one */ - dev = dev_get_by_flags_rcu(net, IFF_UP, - IFF_UP | IFF_LOOPBACK); + dev = __dev_get_by_flags(net, IFF_UP, + IFF_UP | IFF_LOOPBACK); } } else - dev = dev_get_by_index_rcu(net, ifindex); + dev = __dev_get_by_index(net, ifindex); if (dev == NULL) { err = -ENODEV; @@ -137,7 +136,6 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr) } error: - rcu_read_unlock(); rtnl_unlock(); if (pac) sock_kfree_s(sk, pac, sizeof(*pac)); @@ -174,11 +172,9 @@ int ipv6_sock_ac_drop(struct sock *sk, int ifindex, const struct in6_addr *addr) spin_unlock_bh(&ipv6_sk_ac_lock); rtnl_lock(); - rcu_read_lock(); - dev = dev_get_by_index_rcu(net, pac->acl_ifindex); + dev = __dev_get_by_index(net, pac->acl_ifindex); if (dev) ipv6_dev_ac_dec(dev, &pac->acl_addr); - rcu_read_unlock(); rtnl_unlock(); sock_kfree_s(sk, pac, sizeof(*pac)); @@ -203,12 +199,11 @@ void ipv6_sock_ac_close(struct sock *sk) prev_index = 0; rtnl_lock(); - rcu_read_lock(); while (pac) { struct ipv6_ac_socklist *next = pac->acl_next; if (pac->acl_ifindex != prev_index) { - dev = dev_get_by_index_rcu(net, pac->acl_ifindex); + dev = __dev_get_by_index(net, pac->acl_ifindex); prev_index = pac->acl_ifindex; } if (dev) @@ -216,7 +211,6 @@ void ipv6_sock_ac_close(struct sock *sk) sock_kfree_s(sk, pac, sizeof(*pac)); pac = next; } - rcu_read_unlock(); rtnl_unlock(); } @@ -341,7 +335,7 @@ int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr) return 0; } -/* called with rcu_read_lock() */ +/* called with rtnl_lock() */ static int ipv6_dev_ac_dec(struct net_device *dev, const struct in6_addr *addr) { struct inet6_dev *idev = __in6_dev_get(dev); -- cgit v1.2.3-59-g8ed1b From 233577a22089facf5271ab5e845b2262047c971f Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Fri, 12 Sep 2014 14:04:43 +0200 Subject: net: filter: constify detection of pkt_type_offset Currently we have 2 pkt_type_offset() functions doing the same thing, spread across the architecture files. Remove those and replace them with a PKT_TYPE_OFFSET macro helper which gets the constant value with offsetof() from a zero-sized sk_buff member placed right in front of the bitfield. This new offset marker does not change the size of struct sk_buff.
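A condensed sketch of the zero-sized-member trick, with unrelated fields elided (struct and macro names are invented for the illustration):

/* The zero-length array occupies no storage but marks the byte offset
 * of the bitfield that follows it, so offsetof() can name that offset
 * at compile time -- bitfields themselves have no offsetof().
 */
struct example_skb {
	__u8 __pkt_type_offset[0];	/* zero storage, just a marker */
	__u8 pkt_type:3,
	     other_bits:5;
};

#define EXAMPLE_PKT_TYPE_OFFSET() \
	offsetof(struct example_skb, __pkt_type_offset)

/* A loader then fetches that byte and masks out the three bits;
 * on big-endian bitfield layouts the mask is (7 << 5) instead.
 */
static inline unsigned int example_pkt_type(const struct example_skb *skb)
{
	return *((const __u8 *)skb + EXAMPLE_PKT_TYPE_OFFSET()) & 7;
}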
Cc: Eric Dumazet Cc: Markos Chandras Cc: Martin Schwidefsky Cc: Daniel Borkmann Cc: Alexei Starovoitov Signed-off-by: Denis Kirjanov Signed-off-by: Hannes Frederic Sowa Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- arch/mips/net/bpf_jit.c | 27 +-------------------------- arch/s390/net/bpf_jit_comp.c | 35 +---------------------------------- include/linux/skbuff.h | 10 ++++++++++ net/core/filter.c | 31 ++----------------------------- 4 files changed, 14 insertions(+), 89 deletions(-) (limited to 'include/linux') diff --git a/arch/mips/net/bpf_jit.c b/arch/mips/net/bpf_jit.c index 0e97ccd29fe3..7edc08398c4a 100644 --- a/arch/mips/net/bpf_jit.c +++ b/arch/mips/net/bpf_jit.c @@ -765,27 +765,6 @@ static u64 jit_get_skb_w(struct sk_buff *skb, unsigned offset) return (u64)err << 32 | ntohl(ret); } -#ifdef __BIG_ENDIAN_BITFIELD -#define PKT_TYPE_MAX (7 << 5) -#else -#define PKT_TYPE_MAX 7 -#endif -static int pkt_type_offset(void) -{ - struct sk_buff skb_probe = { - .pkt_type = ~0, - }; - u8 *ct = (u8 *)&skb_probe; - unsigned int off; - - for (off = 0; off < sizeof(struct sk_buff); off++) { - if (ct[off] == PKT_TYPE_MAX) - return off; - } - pr_err_once("Please fix pkt_type_offset(), as pkt_type couldn't be found\n"); - return -1; -} - static int build_body(struct jit_ctx *ctx) { void *load_func[] = {jit_get_skb_b, jit_get_skb_h, jit_get_skb_w}; @@ -1332,11 +1311,7 @@ jmp_cmp: case BPF_ANC | SKF_AD_PKTTYPE: ctx->flags |= SEEN_SKB; - off = pkt_type_offset(); - - if (off < 0) - return -1; - emit_load_byte(r_tmp, r_skb, off, ctx); + emit_load_byte(r_tmp, r_skb, PKT_TYPE_OFFSET(), ctx); /* Keep only the last 3 bits */ emit_andi(r_A, r_tmp, PKT_TYPE_MAX, ctx); #ifdef __BIG_ENDIAN_BITFIELD diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 555f5c7e83ab..c52ac77408ca 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -227,37 +227,6 @@ static void bpf_jit_epilogue(struct bpf_jit *jit) EMIT2(0x07fe); } -/* Helper to find the offset of pkt_type in sk_buff - * Make sure its still a 3bit field starting at the MSBs within a byte. - */ -#define PKT_TYPE_MAX 0xe0 -static int pkt_type_offset; - -static int __init bpf_pkt_type_offset_init(void) -{ - struct sk_buff skb_probe = { - .pkt_type = ~0, - }; - char *ct = (char *)&skb_probe; - int off; - - pkt_type_offset = -1; - for (off = 0; off < sizeof(struct sk_buff); off++) { - if (!ct[off]) - continue; - if (ct[off] == PKT_TYPE_MAX) - pkt_type_offset = off; - else { - /* Found non matching bit pattern, fix needed. 
*/ - WARN_ON_ONCE(1); - pkt_type_offset = -1; - return -1; - } - } - return 0; -} -device_initcall(bpf_pkt_type_offset_init); - /* * make sure we dont leak kernel information to user */ @@ -757,12 +726,10 @@ call_fn: /* lg %r1,(%r13) */ } break; case BPF_ANC | SKF_AD_PKTTYPE: - if (pkt_type_offset < 0) - goto out; /* lhi %r5,0 */ EMIT4(0xa7580000); /* ic %r5,(%r2) */ - EMIT4_DISP(0x43502000, pkt_type_offset); + EMIT4_DISP(0x43502000, PKT_TYPE_OFFSET()); /* srl %r5,5 */ EMIT4_DISP(0x88500000, 5); break; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 07c9fdd0c126..756e3d057e84 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -548,6 +548,16 @@ struct sk_buff { ip_summed:2, nohdr:1, nfctinfo:3; + +/* if you move pkt_type around you also must adapt those constants */ +#ifdef __BIG_ENDIAN_BITFIELD +#define PKT_TYPE_MAX (7 << 5) +#else +#define PKT_TYPE_MAX 7 +#endif +#define PKT_TYPE_OFFSET() offsetof(struct sk_buff, __pkt_type_offset) + + __u8 __pkt_type_offset[0]; __u8 pkt_type:3, fclone:2, ipvs_property:1, diff --git a/net/core/filter.c b/net/core/filter.c index dfc716ffa44b..601f28de7311 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -87,30 +87,6 @@ int sk_filter(struct sock *sk, struct sk_buff *skb) } EXPORT_SYMBOL(sk_filter); -/* Helper to find the offset of pkt_type in sk_buff structure. We want - * to make sure its still a 3bit field starting at a byte boundary; - * taken from arch/x86/net/bpf_jit_comp.c. - */ -#ifdef __BIG_ENDIAN_BITFIELD -#define PKT_TYPE_MAX (7 << 5) -#else -#define PKT_TYPE_MAX 7 -#endif -static unsigned int pkt_type_offset(void) -{ - struct sk_buff skb_probe = { .pkt_type = ~0, }; - u8 *ct = (u8 *) &skb_probe; - unsigned int off; - - for (off = 0; off < sizeof(struct sk_buff); off++) { - if (ct[off] == PKT_TYPE_MAX) - return off; - } - - pr_err_once("Please fix %s, as pkt_type couldn't be found!\n", __func__); - return -1; -} - static u64 __skb_get_pay_offset(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) { return skb_get_poff((struct sk_buff *)(unsigned long) ctx); @@ -190,11 +166,8 @@ static bool convert_bpf_extensions(struct sock_filter *fp, break; case SKF_AD_OFF + SKF_AD_PKTTYPE: - *insn = BPF_LDX_MEM(BPF_B, BPF_REG_A, BPF_REG_CTX, - pkt_type_offset()); - if (insn->off < 0) - return false; - insn++; + *insn++ = BPF_LDX_MEM(BPF_B, BPF_REG_A, BPF_REG_CTX, + PKT_TYPE_OFFSET()); *insn = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, PKT_TYPE_MAX); #ifdef __BIG_ENDIAN_BITFIELD insn++; -- cgit v1.2.3-59-g8ed1b From 3fc8867740b4a0bf56f372c6f5ddd14970962fb1 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 12 Sep 2014 23:12:46 -0700 Subject: netdevice: Support DSA tagging when DSA is built as a module This change corrects an error seen when DSA tagging is built as a module. Without this change it is not possible to get XDSA tagged frames, as the test for tagging is compiled out by the #ifdef check: CONFIG_NET_DSA is only defined when DSA is built in, not when it is built as a module.
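A quick illustration of the Kconfig pitfall being fixed here (CONFIG_FOO is a placeholder, not a real option):

#ifdef CONFIG_FOO		/* defined only for CONFIG_FOO=y */
/* ... */
#endif

#if IS_ENABLED(CONFIG_FOO)	/* true for CONFIG_FOO=y and =m, since
				 * =m defines CONFIG_FOO_MODULE instead
				 */
/* ... */
#endif

Signed-off-by: Alexander Duyck Acked-by: Florian Fainelli Signed-off-by: David S.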
--- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ee38b948d9a0..f9e81d10a3b9 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1789,7 +1789,7 @@ void dev_net_set(struct net_device *dev, struct net *net) static inline bool netdev_uses_dsa(struct net_device *dev) { -#ifdef CONFIG_NET_DSA +#if IS_ENABLED(CONFIG_NET_DSA) if (dev->dsa_ptr != NULL) return dsa_uses_tagged_protocol(dev->dsa_ptr); #endif -- cgit v1.2.3-59-g8ed1b From 0e9871e3f79fd17c691b50a9669220c54ff084a2 Mon Sep 17 00:00:00 2001 From: Anton Danilov Date: Thu, 28 Aug 2014 10:11:27 +0400 Subject: netfilter: ipset: Add skbinfo extension kernel support in the ipset core. The skbinfo extension provides mapping of metainformation with lookups in the ipset tables. This patch defines the flags, the constants, the functions and the structures for the data type independent support of the extension. Note that the firewall mark is stored in the kernel structures as two 32bit values, but is transferred through netlink as one 64bit value. Signed-off-by: Anton Danilov Signed-off-by: Jozsef Kadlecsik --- include/linux/netfilter/ipset/ip_set.h | 56 ++++++++++++++++++++++++++++- include/uapi/linux/netfilter/ipset/ip_set.h | 12 +++++++ net/netfilter/ipset/ip_set_core.c | 27 +++++++++++++- 3 files changed, 93 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h index 96afc29184be..b97aac5142ed 100644 --- a/include/linux/netfilter/ipset/ip_set.h +++ b/include/linux/netfilter/ipset/ip_set.h @@ -57,6 +57,8 @@ enum ip_set_extension { IPSET_EXT_COUNTER = (1 << IPSET_EXT_BIT_COUNTER), IPSET_EXT_BIT_COMMENT = 2, IPSET_EXT_COMMENT = (1 << IPSET_EXT_BIT_COMMENT), + IPSET_EXT_BIT_SKBINFO = 3, + IPSET_EXT_SKBINFO = (1 << IPSET_EXT_BIT_SKBINFO), /* Mark set with an extension which needs to call destroy */ IPSET_EXT_BIT_DESTROY = 7, IPSET_EXT_DESTROY = (1 << IPSET_EXT_BIT_DESTROY), @@ -65,12 +67,14 @@ enum ip_set_extension { #define SET_WITH_TIMEOUT(s) ((s)->extensions & IPSET_EXT_TIMEOUT) #define SET_WITH_COUNTER(s) ((s)->extensions & IPSET_EXT_COUNTER) #define SET_WITH_COMMENT(s) ((s)->extensions & IPSET_EXT_COMMENT) +#define SET_WITH_SKBINFO(s) ((s)->extensions & IPSET_EXT_SKBINFO) #define SET_WITH_FORCEADD(s) ((s)->flags & IPSET_CREATE_FLAG_FORCEADD) /* Extension id, in size order */ enum ip_set_ext_id { IPSET_EXT_ID_COUNTER = 0, IPSET_EXT_ID_TIMEOUT, + IPSET_EXT_ID_SKBINFO, IPSET_EXT_ID_COMMENT, IPSET_EXT_ID_MAX, }; @@ -92,6 +96,10 @@ struct ip_set_ext { u64 packets; u64 bytes; u32 timeout; + u32 skbmark; + u32 skbmarkmask; + u32 skbprio; + u16 skbqueue; char *comment; }; @@ -104,6 +112,13 @@ struct ip_set_comment { char *str; }; +struct ip_set_skbinfo { + u32 skbmark; + u32 skbmarkmask; + u32 skbprio; + u16 skbqueue; +}; + struct ip_set; #define ext_timeout(e, s) \ @@ -112,7 +127,8 @@ struct ip_set; (struct ip_set_counter *)(((void *)(e)) + (s)->offset[IPSET_EXT_ID_COUNTER]) #define ext_comment(e, s) \ (struct ip_set_comment *)(((void *)(e)) + (s)->offset[IPSET_EXT_ID_COMMENT]) - +#define ext_skbinfo(e, s) \ +(struct ip_set_skbinfo *)(((void *)(e)) + (s)->offset[IPSET_EXT_ID_SKBINFO]) typedef int (*ipset_adtfn)(struct ip_set *set, void *value, const struct ip_set_ext *ext, @@ -256,6 +272,8 @@ ip_set_put_flags(struct sk_buff *skb, struct ip_set *set) cadt_flags |= IPSET_FLAG_WITH_COUNTERS; if
(SET_WITH_COMMENT(set)) cadt_flags |= IPSET_FLAG_WITH_COMMENT; + if (SET_WITH_SKBINFO(set)) + cadt_flags |= IPSET_FLAG_WITH_SKBINFO; if (SET_WITH_FORCEADD(set)) cadt_flags |= IPSET_FLAG_WITH_FORCEADD; @@ -304,6 +322,39 @@ ip_set_update_counter(struct ip_set_counter *counter, } } +static inline void +ip_set_get_skbinfo(struct ip_set_skbinfo *skbinfo, + const struct ip_set_ext *ext, + struct ip_set_ext *mext, u32 flags) +{ + mext->skbmark = skbinfo->skbmark; + mext->skbmarkmask = skbinfo->skbmarkmask; + mext->skbprio = skbinfo->skbprio; + mext->skbqueue = skbinfo->skbqueue; +} +static inline bool +ip_set_put_skbinfo(struct sk_buff *skb, struct ip_set_skbinfo *skbinfo) +{ + return nla_put_net64(skb, IPSET_ATTR_SKBMARK, + cpu_to_be64((u64)skbinfo->skbmark << 32 | + skbinfo->skbmarkmask)) || + nla_put_net32(skb, IPSET_ATTR_SKBPRIO, + cpu_to_be32(skbinfo->skbprio)) || + nla_put_net16(skb, IPSET_ATTR_SKBQUEUE, + cpu_to_be16(skbinfo->skbqueue)); + +} + +static inline void +ip_set_init_skbinfo(struct ip_set_skbinfo *skbinfo, + const struct ip_set_ext *ext) +{ + skbinfo->skbmark = ext->skbmark; + skbinfo->skbmarkmask = ext->skbmarkmask; + skbinfo->skbprio = ext->skbprio; + skbinfo->skbqueue = ext->skbqueue; +} + static inline bool ip_set_put_counter(struct sk_buff *skb, struct ip_set_counter *counter) { @@ -497,6 +548,9 @@ ip_set_put_extensions(struct sk_buff *skb, const struct ip_set *set, if (SET_WITH_COMMENT(set) && ip_set_put_comment(skb, ext_comment(e, set))) return -EMSGSIZE; + if (SET_WITH_SKBINFO(set) && + ip_set_put_skbinfo(skb, ext_skbinfo(e, set))) + return -EMSGSIZE; return 0; } diff --git a/include/uapi/linux/netfilter/ipset/ip_set.h b/include/uapi/linux/netfilter/ipset/ip_set.h index 78c2f2e79920..ca03119111a2 100644 --- a/include/uapi/linux/netfilter/ipset/ip_set.h +++ b/include/uapi/linux/netfilter/ipset/ip_set.h @@ -115,6 +115,9 @@ enum { IPSET_ATTR_BYTES, IPSET_ATTR_PACKETS, IPSET_ATTR_COMMENT, + IPSET_ATTR_SKBMARK, + IPSET_ATTR_SKBPRIO, + IPSET_ATTR_SKBQUEUE, __IPSET_ATTR_ADT_MAX, }; #define IPSET_ATTR_ADT_MAX (__IPSET_ATTR_ADT_MAX - 1) @@ -147,6 +150,7 @@ enum ipset_errno { IPSET_ERR_COUNTER, IPSET_ERR_COMMENT, IPSET_ERR_INVALID_MARKMASK, + IPSET_ERR_SKBINFO, /* Type specific error codes */ IPSET_ERR_TYPE_SPECIFIC = 4352, @@ -170,6 +174,12 @@ enum ipset_cmd_flags { IPSET_FLAG_MATCH_COUNTERS = (1 << IPSET_FLAG_BIT_MATCH_COUNTERS), IPSET_FLAG_BIT_RETURN_NOMATCH = 7, IPSET_FLAG_RETURN_NOMATCH = (1 << IPSET_FLAG_BIT_RETURN_NOMATCH), + IPSET_FLAG_BIT_MAP_SKBMARK = 8, + IPSET_FLAG_MAP_SKBMARK = (1 << IPSET_FLAG_BIT_MAP_SKBMARK), + IPSET_FLAG_BIT_MAP_SKBPRIO = 9, + IPSET_FLAG_MAP_SKBPRIO = (1 << IPSET_FLAG_BIT_MAP_SKBPRIO), + IPSET_FLAG_BIT_MAP_SKBQUEUE = 10, + IPSET_FLAG_MAP_SKBQUEUE = (1 << IPSET_FLAG_BIT_MAP_SKBQUEUE), IPSET_FLAG_CMD_MAX = 15, }; @@ -187,6 +197,8 @@ enum ipset_cadt_flags { IPSET_FLAG_WITH_COMMENT = (1 << IPSET_FLAG_BIT_WITH_COMMENT), IPSET_FLAG_BIT_WITH_FORCEADD = 5, IPSET_FLAG_WITH_FORCEADD = (1 << IPSET_FLAG_BIT_WITH_FORCEADD), + IPSET_FLAG_BIT_WITH_SKBINFO = 6, + IPSET_FLAG_WITH_SKBINFO = (1 << IPSET_FLAG_BIT_WITH_SKBINFO), IPSET_FLAG_CADT_MAX = 15, }; diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 4ca4e5ca6f57..26c795e6b57f 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -337,6 +337,12 @@ const struct ip_set_ext_type ip_set_extensions[] = { .len = sizeof(unsigned long), .align = __alignof__(unsigned long), }, + [IPSET_EXT_ID_SKBINFO] = { + .type = IPSET_EXT_SKBINFO, + .flag = 
IPSET_FLAG_WITH_SKBINFO, + .len = sizeof(struct ip_set_skbinfo), + .align = __alignof__(struct ip_set_skbinfo), + }, [IPSET_EXT_ID_COMMENT] = { .type = IPSET_EXT_COMMENT | IPSET_EXT_DESTROY, .flag = IPSET_FLAG_WITH_COMMENT, @@ -382,6 +388,7 @@ int ip_set_get_extensions(struct ip_set *set, struct nlattr *tb[], struct ip_set_ext *ext) { + u64 fullmark; if (tb[IPSET_ATTR_TIMEOUT]) { if (!(set->extensions & IPSET_EXT_TIMEOUT)) return -IPSET_ERR_TIMEOUT; @@ -402,7 +409,25 @@ ip_set_get_extensions(struct ip_set *set, struct nlattr *tb[], return -IPSET_ERR_COMMENT; ext->comment = ip_set_comment_uget(tb[IPSET_ATTR_COMMENT]); } - + if (tb[IPSET_ATTR_SKBMARK]) { + if (!(set->extensions & IPSET_EXT_SKBINFO)) + return -IPSET_ERR_SKBINFO; + fullmark = be64_to_cpu(nla_get_be64(tb[IPSET_ATTR_SKBMARK])); + ext->skbmark = fullmark >> 32; + ext->skbmarkmask = fullmark & 0xffffffff; + } + if (tb[IPSET_ATTR_SKBPRIO]) { + if (!(set->extensions & IPSET_EXT_SKBINFO)) + return -IPSET_ERR_SKBINFO; + ext->skbprio = be32_to_cpu(nla_get_be32( + tb[IPSET_ATTR_SKBPRIO])); + } + if (tb[IPSET_ATTR_SKBQUEUE]) { + if (!(set->extensions & IPSET_EXT_SKBINFO)) + return -IPSET_ERR_SKBINFO; + ext->skbqueue = be16_to_cpu(nla_get_be16( + tb[IPSET_ATTR_SKBQUEUE])); + } return 0; } EXPORT_SYMBOL_GPL(ip_set_get_extensions); -- cgit v1.2.3-59-g8ed1b From aef96193fe7b2791c4a3b19fe75426b929769471 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Mon, 15 Sep 2014 17:30:54 +0200 Subject: netfilter: ipset: send nonzero skbinfo extensions only Do not send zero valued skbinfo extensions to userspace at listing. Signed-off-by: Jozsef Kadlecsik --- include/linux/netfilter/ipset/ip_set.h | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h index b97aac5142ed..f1606fa6132d 100644 --- a/include/linux/netfilter/ipset/ip_set.h +++ b/include/linux/netfilter/ipset/ip_set.h @@ -335,13 +335,17 @@ ip_set_get_skbinfo(struct ip_set_skbinfo *skbinfo, static inline bool ip_set_put_skbinfo(struct sk_buff *skb, struct ip_set_skbinfo *skbinfo) { - return nla_put_net64(skb, IPSET_ATTR_SKBMARK, - cpu_to_be64((u64)skbinfo->skbmark << 32 | - skbinfo->skbmarkmask)) || - nla_put_net32(skb, IPSET_ATTR_SKBPRIO, - cpu_to_be32(skbinfo->skbprio)) || - nla_put_net16(skb, IPSET_ATTR_SKBQUEUE, - cpu_to_be16(skbinfo->skbqueue)); + /* Send nonzero parameters only */ + return ((skbinfo->skbmark || skbinfo->skbmarkmask) && + nla_put_net64(skb, IPSET_ATTR_SKBMARK, + cpu_to_be64((u64)skbinfo->skbmark << 32 | + skbinfo->skbmarkmask))) || + (skbinfo->skbprio && + nla_put_net32(skb, IPSET_ATTR_SKBPRIO, + cpu_to_be32(skbinfo->skbprio))) || + (skbinfo->skbqueue && + nla_put_net16(skb, IPSET_ATTR_SKBQUEUE, + cpu_to_be16(skbinfo->skbqueue))); } -- cgit v1.2.3-59-g8ed1b From 5075314e4e4b559cc37675ad8a721a89bccd6284 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 15 Sep 2014 13:00:19 -0400 Subject: dsa: Split ops up, and avoid assigning tag_protocol and receive separately This change addresses several issues. First, it was possible to set tag_protocol without setting the ops pointer. To correct that I have reordered things so that rcv is now populated before we set tag_protocol. Second, it didn't make much sense to keep setting the device ops each time a new slave was registered. So by moving the receive portion out into root switch initialization that issue should be addressed. 
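In outline, the model this ends up with looks like the stand-alone sketch below (names simplified, not the kernel code): the tree holds a single rcv pointer chosen once at root-switch setup, and each slave carries its own xmit pointer.

struct sk_buff;                                   /* opaque for this sketch */

struct dsa_tree  { int (*rcv)(struct sk_buff *skb); };
struct dsa_slave { int (*xmit)(struct sk_buff *skb); };

static int edsa_rcv(struct sk_buff *skb) { (void)skb; return 0; /* untag */ }

/* chosen once when the root switch is brought up... */
static void tree_setup(struct dsa_tree *dst)
{
        dst->rcv = edsa_rcv;
}

/* ...so "is tagging in use?" reduces to a NULL check on the tree */
static int tree_uses_tags(const struct dsa_tree *dst)
{
        return dst->rcv != 0;
}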
Third, I wanted to avoid sending tags if the rcv pointer was not registered so I changed the tag check to verify if the rcv function pointer is set on the root tree. If it is then we start sending DSA tagged frames. Finally I split the device ops pointer in the structures into two spots. I placed the rcv function pointer in the root switch since this makes it easiest to access from there, and I placed the xmit function pointer in the slave for the same reason. Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- include/linux/netdevice.h | 7 ------- include/net/dsa.h | 10 ++++++---- net/dsa/dsa.c | 32 ++++++++++++++++++++++++++++---- net/dsa/dsa_priv.h | 11 ++++++++++- net/dsa/slave.c | 37 +++++++++++++++---------------------- net/dsa/tag_brcm.c | 1 - net/dsa/tag_dsa.c | 1 - net/dsa/tag_edsa.c | 1 - net/dsa/tag_trailer.c | 1 - 9 files changed, 59 insertions(+), 42 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f9e81d10a3b9..28d4378615e5 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1928,13 +1928,6 @@ struct udp_offload { struct offload_callbacks callbacks; }; -struct dsa_device_ops { - netdev_tx_t (*xmit)(struct sk_buff *skb, struct net_device *dev); - int (*rcv)(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt, struct net_device *orig_dev); -}; - - /* often modified stats are per cpu, other are shared (netdev->stats) */ struct pcpu_sw_netstats { u64 rx_packets; diff --git a/include/net/dsa.h b/include/net/dsa.h index 8a8a5d976f97..a55c4e6a4f0f 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -77,7 +77,7 @@ struct dsa_platform_data { struct dsa_chip_data *chip; }; -struct dsa_device_ops; +struct packet_type; struct dsa_switch_tree { /* @@ -91,7 +91,10 @@ struct dsa_switch_tree { * protocol to use. */ struct net_device *master_netdev; - const struct dsa_device_ops *ops; + int (*rcv)(struct sk_buff *skb, + struct net_device *dev, + struct packet_type *pt, + struct net_device *orig_dev); enum dsa_tag_protocol tag_protocol; /* @@ -218,7 +221,6 @@ static inline void *ds_to_priv(struct dsa_switch *ds) static inline bool dsa_uses_tagged_protocol(struct dsa_switch_tree *dst) { - return dst->tag_protocol != DSA_TAG_PROTO_NONE; + return dst->rcv != NULL; } - #endif diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 61f145c44555..1df0a7cf1e9e 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -10,7 +10,6 @@ */ #include -#include #include #include #include @@ -154,9 +153,34 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index, * tagging protocol to the preferred tagging format of this * switch. */ - if (ds->dst->cpu_switch == index) - ds->dst->tag_protocol = drv->tag_protocol; + if (dst->cpu_switch == index) { + switch (drv->tag_protocol) { +#ifdef CONFIG_NET_DSA_TAG_DSA + case DSA_TAG_PROTO_DSA: + dst->rcv = dsa_netdev_ops.rcv; + break; +#endif +#ifdef CONFIG_NET_DSA_TAG_EDSA + case DSA_TAG_PROTO_EDSA: + dst->rcv = edsa_netdev_ops.rcv; + break; +#endif +#ifdef CONFIG_NET_DSA_TAG_TRAILER + case DSA_TAG_PROTO_TRAILER: + dst->rcv = trailer_netdev_ops.rcv; + break; +#endif +#ifdef CONFIG_NET_DSA_TAG_BRCM + case DSA_TAG_PROTO_BRCM: + dst->rcv = brcm_netdev_ops.rcv; + break; +#endif + default: + break; + } + dst->tag_protocol = drv->tag_protocol; + } /* * Do basic register setup. 
@@ -626,7 +650,7 @@ static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev, return 0; } - return dst->ops->rcv(skb, dev, pt, orig_dev); + return dst->rcv(skb, dev, pt, orig_dev); } static struct packet_type dsa_pack_type __read_mostly = { diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 98afed4d92ba..f90899e8ab5a 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -12,7 +12,13 @@ #define __DSA_PRIV_H #include -#include +#include + +struct dsa_device_ops { + netdev_tx_t (*xmit)(struct sk_buff *skb, struct net_device *dev); + int (*rcv)(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt, struct net_device *orig_dev); +}; struct dsa_slave_priv { /* @@ -20,6 +26,8 @@ struct dsa_slave_priv { * switch port. */ struct net_device *dev; + netdev_tx_t (*xmit)(struct sk_buff *skb, + struct net_device *dev); /* * Which switch this port is a part of, and the port index @@ -43,6 +51,7 @@ struct dsa_slave_priv { extern char dsa_driver_version[]; /* slave.c */ +extern const struct dsa_device_ops notag_netdev_ops; void dsa_slave_mii_bus_init(struct dsa_switch *ds); struct net_device *dsa_slave_create(struct dsa_switch *ds, struct device *parent, diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 809eeb13eb12..e38a331111c0 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -9,7 +9,6 @@ */ #include -#include #include #include #include @@ -176,9 +175,8 @@ static int dsa_slave_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch_tree *dst = p->parent->dst; - return dst->ops->xmit(skb, dev); + return p->xmit(skb, dev); } static netdev_tx_t dsa_slave_notag_xmit(struct sk_buff *skb, @@ -325,11 +323,6 @@ static const struct net_device_ops dsa_slave_netdev_ops = { .ndo_do_ioctl = dsa_slave_ioctl, }; -static const struct dsa_device_ops notag_netdev_ops = { - .xmit = dsa_slave_notag_xmit, - .rcv = NULL, -}; - static void dsa_slave_adjust_link(struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); @@ -435,41 +428,41 @@ dsa_slave_create(struct dsa_switch *ds, struct device *parent, slave_dev->tx_queue_len = 0; slave_dev->netdev_ops = &dsa_slave_netdev_ops; + SET_NETDEV_DEV(slave_dev, parent); + slave_dev->dev.of_node = ds->pd->port_dn[port]; + slave_dev->vlan_features = master->vlan_features; + + p = netdev_priv(slave_dev); + p->dev = slave_dev; + p->parent = ds; + p->port = port; + switch (ds->dst->tag_protocol) { #ifdef CONFIG_NET_DSA_TAG_DSA case DSA_TAG_PROTO_DSA: - ds->dst->ops = &dsa_netdev_ops; + p->xmit = dsa_netdev_ops.xmit; break; #endif #ifdef CONFIG_NET_DSA_TAG_EDSA case DSA_TAG_PROTO_EDSA: - ds->dst->ops = &edsa_netdev_ops; + p->xmit = edsa_netdev_ops.xmit; break; #endif #ifdef CONFIG_NET_DSA_TAG_TRAILER case DSA_TAG_PROTO_TRAILER: - ds->dst->ops = &trailer_netdev_ops; + p->xmit = trailer_netdev_ops.xmit; break; #endif #ifdef CONFIG_NET_DSA_TAG_BRCM case DSA_TAG_PROTO_BRCM: - ds->dst->ops = &brcm_netdev_ops; + p->xmit = brcm_netdev_ops.xmit; break; #endif default: - ds->dst->ops = ¬ag_netdev_ops; + p->xmit = dsa_slave_notag_xmit; break; } - SET_NETDEV_DEV(slave_dev, parent); - slave_dev->dev.of_node = ds->pd->port_dn[port]; - slave_dev->vlan_features = master->vlan_features; - - p = netdev_priv(slave_dev); - p->dev = slave_dev; - p->parent = ds; - p->port = port; - p->old_pause = -1; p->old_link = -1; p->old_duplex = -1; diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c index 
8fbc21c0de78..83d3572cdb20 100644 --- a/net/dsa/tag_brcm.c +++ b/net/dsa/tag_brcm.c @@ -11,7 +11,6 @@ #include #include -#include #include #include "dsa_priv.h" diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c index d7dbc5bda5c0..ce90c8bdc658 100644 --- a/net/dsa/tag_dsa.c +++ b/net/dsa/tag_dsa.c @@ -10,7 +10,6 @@ #include #include -#include #include #include "dsa_priv.h" diff --git a/net/dsa/tag_edsa.c b/net/dsa/tag_edsa.c index 6b30abe89183..94fcce778679 100644 --- a/net/dsa/tag_edsa.c +++ b/net/dsa/tag_edsa.c @@ -10,7 +10,6 @@ #include #include -#include #include #include "dsa_priv.h" diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c index 5fe9444842c5..115fdca34077 100644 --- a/net/dsa/tag_trailer.c +++ b/net/dsa/tag_trailer.c @@ -10,7 +10,6 @@ #include #include -#include #include #include "dsa_priv.h" -- cgit v1.2.3-59-g8ed1b From 2e4e44107176d552f8bb1bb76053e850e3809841 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 17 Sep 2014 04:49:49 -0700 Subject: net: add alloc_skb_with_frags() helper Extract from sock_alloc_send_pskb() code building skb with frags, so that we can reuse this in other contexts. Intent is to use it from tcp_send_rcvq(), tcp_collapse(), ... We also want to replace some skb_linearize() calls to a more reliable strategy in pathological cases where we need to reduce number of frags. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/skbuff.h | 6 ++++ net/core/skbuff.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++ net/core/sock.c | 78 ++++++++++---------------------------------------- 3 files changed, 99 insertions(+), 63 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 756e3d057e84..f1bfa3781c75 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -769,6 +769,12 @@ static inline struct sk_buff *alloc_skb(unsigned int size, return __alloc_skb(size, priority, 0, NUMA_NO_NODE); } +struct sk_buff *alloc_skb_with_frags(unsigned long header_len, + unsigned long data_len, + int max_page_order, + int *errcode, + gfp_t gfp_mask); + static inline struct sk_buff *alloc_skb_fclone(unsigned int size, gfp_t priority) { diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 29f7f0121491..06a8feb10099 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4102,3 +4102,81 @@ err_free: return NULL; } EXPORT_SYMBOL(skb_vlan_untag); + +/** + * alloc_skb_with_frags - allocate skb with page frags + * + * header_len: size of linear part + * data_len: needed length in frags + * max_page_order: max page order desired. + * errcode: pointer to error code if any + * gfp_mask: allocation mask + * + * This can be used to allocate a paged skb, given a maximal order for frags. + */ +struct sk_buff *alloc_skb_with_frags(unsigned long header_len, + unsigned long data_len, + int max_page_order, + int *errcode, + gfp_t gfp_mask) +{ + int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT; + unsigned long chunk; + struct sk_buff *skb; + struct page *page; + gfp_t gfp_head; + int i; + + *errcode = -EMSGSIZE; + /* Note this test could be relaxed, if we succeed to allocate + * high order pages... 
+ */ + if (npages > MAX_SKB_FRAGS) + return NULL; + + gfp_head = gfp_mask; + if (gfp_head & __GFP_WAIT) + gfp_head |= __GFP_REPEAT; + + *errcode = -ENOBUFS; + skb = alloc_skb(header_len, gfp_head); + if (!skb) + return NULL; + + skb->truesize += npages << PAGE_SHIFT; + + for (i = 0; npages > 0; i++) { + int order = max_page_order; + + while (order) { + if (npages >= 1 << order) { + page = alloc_pages(gfp_mask | + __GFP_COMP | + __GFP_NOWARN | + __GFP_NORETRY, + order); + if (page) + goto fill_page; + /* Do not retry other high order allocations */ + order = 1; + max_page_order = 0; + } + order--; + } + page = alloc_page(gfp_mask); + if (!page) + goto failure; +fill_page: + chunk = min_t(unsigned long, data_len, + PAGE_SIZE << order); + skb_fill_page_desc(skb, i, page, 0, chunk); + data_len -= chunk; + npages -= 1 << order; + } + return skb; + +failure: + kfree_skb(skb); + return NULL; +} +EXPORT_SYMBOL(alloc_skb_with_frags); diff --git a/net/core/sock.c b/net/core/sock.c index 6f436b5e4961..de887c45c63b 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1762,21 +1762,12 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, unsigned long data_len, int noblock, int *errcode, int max_page_order) { - struct sk_buff *skb = NULL; - unsigned long chunk; - gfp_t gfp_mask; + struct sk_buff *skb; long timeo; int err; - int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT; - struct page *page; - int i; - - err = -EMSGSIZE; - if (npages > MAX_SKB_FRAGS) - goto failure; timeo = sock_sndtimeo(sk, noblock); - while (!skb) { + for (;;) { err = sock_error(sk); if (err != 0) goto failure; @@ -1785,66 +1776,27 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, if (sk->sk_shutdown & SEND_SHUTDOWN) goto failure; - if (atomic_read(&sk->sk_wmem_alloc) >= sk->sk_sndbuf) { - set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); - set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); - err = -EAGAIN; - if (!timeo) - goto failure; - if (signal_pending(current)) - goto interrupted; - timeo = sock_wait_for_wmem(sk, timeo); - continue; - } - - err = -ENOBUFS; - gfp_mask = sk->sk_allocation; - if (gfp_mask & __GFP_WAIT) - gfp_mask |= __GFP_REPEAT; + if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf) + break; - skb = alloc_skb(header_len, gfp_mask); - if (!skb) + set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + err = -EAGAIN; + if (!timeo) goto failure; - - skb->truesize += data_len; - - for (i = 0; npages > 0; i++) { - int order = max_page_order; - - while (order) { - if (npages >= 1 << order) { - page = alloc_pages(sk->sk_allocation | - __GFP_COMP | - __GFP_NOWARN | - __GFP_NORETRY, - order); - if (page) - goto fill_page; - /* Do not retry other high order allocations */ - order = 1; - max_page_order = 0; - } - order--; - } - page = alloc_page(sk->sk_allocation); - if (!page) - goto failure; -fill_page: - chunk = min_t(unsigned long, data_len, - PAGE_SIZE << order); - skb_fill_page_desc(skb, i, page, 0, chunk); - data_len -= chunk; - npages -= 1 << order; - } + if (signal_pending(current)) + goto interrupted; + timeo = sock_wait_for_wmem(sk, timeo); } - - skb_set_owner_w(skb, sk); + skb = alloc_skb_with_frags(header_len, data_len, max_page_order, + errcode, sk->sk_allocation); + if (skb) + skb_set_owner_w(skb, sk); return skb; interrupted: err = sock_intr_errno(timeo); failure: - kfree_skb(skb); *errcode = err; return NULL; } -- cgit v1.2.3-59-g8ed1b
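The allocation policy of the new helper is the interesting part: it tries the largest page order first, never retries a failed high-order attempt (order drops to 1 and max_page_order to 0), and falls back to order-0 pages, failing only when even those are unavailable. A hedged caller-side sketch (example_sender and the parameter values are illustrative; in-kernel context assumed):

#include <linux/skbuff.h>
#include <net/sock.h>

static int example_sender(struct sock *sk)
{
        struct sk_buff *skb;
        int errcode;

        /* 256B linear headroom, 32KB of frag data, try up to order-3 pages */
        skb = alloc_skb_with_frags(256, 32768, 3, &errcode, GFP_KERNEL);
        if (!skb)
                return errcode;          /* -EMSGSIZE or -ENOBUFS */

        skb_set_owner_w(skb, sk);        /* as sock_alloc_send_pskb() now does */
        /* ... fill the frags and hand the skb to the stack ... */
        return 0;
}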
From bb7d93496f7ac203f7c3e9678000d1c83eb4e0ba Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 19 Sep 2014 13:07:50 -0700 Subject: net: phy: broadcom: add helper for PHY revision and patch level The Broadcom BCM7xxx internal PHYs do not contain any useful revision information in the low 4-bits of their MII_PHYSID2 (MII register 3) which could allow us to properly identify them. As a result, we need the actual hardware block integrating these PHYs: GENET or the SF2 switch to tell us what revision they are built with. To assist with that, add two helper macros for fetching the PHY revision and patch level from the struct phy_device::dev_flags. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/brcmphy.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h index 5bd35cc0d471..e4ee9c6679e8 100644 --- a/include/linux/brcmphy.h +++ b/include/linux/brcmphy.h @@ -41,6 +41,8 @@ #define PHY_BRCM_DIS_TXCRXC_NOENRGY 0x00008000 /* Broadcom BCM7xxx specific workarounds */ #define PHY_BRCM_100MBPS_WAR 0x00010000 +#define PHY_BRCM_7XXX_REV(x) (((x) >> 8) & 0xff) +#define PHY_BRCM_7XXX_PATCH(x) ((x) & 0xff) #define PHY_BCM_FLAGS_VALID 0x80000000 /* Broadcom BCM54XX register definitions, common to most Broadcom PHYs */ -- cgit v1.2.3-59-g8ed1b From 80780a54ecded1647e661ababde13554a149f7f3 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 19 Sep 2014 13:07:52 -0700 Subject: net: bcmgenet: remove PHY_BRCM_100MBPS_WAR Now that we have removed the need for the PHY_BRCM_100MBPS_WAR flag, we can remove it from the GENET driver and the broadcom shared header file. The PHY driver checks the PHY supported bitmask instead. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/genet/bcmmii.c | 10 ---------- include/linux/brcmphy.h | 1 - 2 files changed, 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/broadcom/genet/bcmmii.c b/drivers/net/ethernet/broadcom/genet/bcmmii.c index c88f7ae99636..71f2ef7d33a9 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmmii.c +++ b/drivers/net/ethernet/broadcom/genet/bcmmii.c @@ -296,7 +296,6 @@ static int bcmgenet_mii_probe(struct net_device *dev) struct bcmgenet_priv *priv = netdev_priv(dev); struct device_node *dn = priv->pdev->dev.of_node; struct phy_device *phydev; - unsigned int phy_flags; int ret; if (priv->phydev) { @@ -338,15 +337,6 @@ static int bcmgenet_mii_probe(struct net_device *dev) return ret; } - phy_flags = PHY_BRCM_100MBPS_WAR; - - /* workarounds are only needed for 100Mpbs PHYs, and - * never on GENET V1 hardware - */ - if ((phydev->supported & PHY_GBIT_FEATURES) || GENET_IS_V1(priv)) - phy_flags = 0; - - phydev->dev_flags |= phy_flags; phydev->advertising = phydev->supported; /* The internal PHY has its link interrupts routed to the diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h index e4ee9c6679e8..3f626fe48c9a 100644 --- a/include/linux/brcmphy.h +++ b/include/linux/brcmphy.h @@ -40,7 +40,6 @@ #define PHY_BRCM_CLEAR_RGMII_MODE 0x00004000 #define PHY_BRCM_DIS_TXCRXC_NOENRGY 0x00008000 /* Broadcom BCM7xxx specific workarounds */ -#define PHY_BRCM_100MBPS_WAR 0x00010000 #define PHY_BRCM_7XXX_REV(x) (((x) >> 8) & 0xff) #define PHY_BRCM_7XXX_PATCH(x) ((x) & 0xff) #define PHY_BCM_FLAGS_VALID 0x80000000 /* Broadcom BCM54XX register definitions, common to most Broadcom PHYs */ -- cgit v1.2.3-59-g8ed1b From afe93325bc02a5b2dea0cd7d78225de692265e6e Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Wed, 17 Sep 2014 12:25:57 -0700 Subject: fou: Add GRO support Implement fou_gro_receive and
fou_gro_complete, and populate these in the corresponding udp_offloads for the socket. Add ipproto to udp_offloads and pass this from UDP to the fou GRO routine in the proto field of the napi_gro_cb structure. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 +- net/ipv4/fou.c | 89 +++++++++++++++++++++++++++++++++++++++++++++++ net/ipv4/udp_offload.c | 5 ++- 3 files changed, 95 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 28d4378615e5..4354b4307e37 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1874,7 +1874,7 @@ struct napi_gro_cb { /* jiffies when first packet was created/queued */ unsigned long age; - /* Used in ipv6_gro_receive() */ + /* Used in ipv6_gro_receive() and foo-over-udp */ u16 proto; /* Used in udp_gro_receive */ @@ -1925,6 +1925,7 @@ struct packet_offload { struct udp_offload { __be16 port; + u8 ipproto; struct offload_callbacks callbacks; }; diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index d44f97b79418..dced89fbe480 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -21,6 +22,7 @@ struct fou { struct socket *sock; u8 protocol; u16 port; + struct udp_offload udp_offloads; struct list_head list; }; @@ -62,6 +64,69 @@ static int fou_udp_recv(struct sock *sk, struct sk_buff *skb) sizeof(struct udphdr)); } +static struct sk_buff **fou_gro_receive(struct sk_buff **head, + struct sk_buff *skb, + const struct net_offload **offloads) +{ + const struct net_offload *ops; + struct sk_buff **pp = NULL; + u8 proto = NAPI_GRO_CB(skb)->proto; + + rcu_read_lock(); + ops = rcu_dereference(offloads[proto]); + if (!ops || !ops->callbacks.gro_receive) + goto out_unlock; + + pp = ops->callbacks.gro_receive(head, skb); + +out_unlock: + rcu_read_unlock(); + + return pp; +} + +static int fou_gro_complete(struct sk_buff *skb, int nhoff, + const struct net_offload **offloads) +{ + const struct net_offload *ops; + u8 proto = NAPI_GRO_CB(skb)->proto; + int err = -ENOSYS; + + rcu_read_lock(); + ops = rcu_dereference(offloads[proto]); + if (WARN_ON(!ops || !ops->callbacks.gro_complete)) + goto out_unlock; + + err = ops->callbacks.gro_complete(skb, nhoff); + +out_unlock: + rcu_read_unlock(); + + return err; +} + +static struct sk_buff **fou4_gro_receive(struct sk_buff **head, + struct sk_buff *skb) +{ + return fou_gro_receive(head, skb, inet_offloads); +} + +static int fou4_gro_complete(struct sk_buff *skb, int nhoff) +{ + return fou_gro_complete(skb, nhoff, inet_offloads); +} + +static struct sk_buff **fou6_gro_receive(struct sk_buff **head, + struct sk_buff *skb) +{ + return fou_gro_receive(head, skb, inet6_offloads); +} + +static int fou6_gro_complete(struct sk_buff *skb, int nhoff) +{ + return fou_gro_complete(skb, nhoff, inet6_offloads); +} + static int fou_add_to_port_list(struct fou *fou) { struct fou *fout; @@ -134,6 +199,29 @@ static int fou_create(struct net *net, struct fou_cfg *cfg, sk->sk_allocation = GFP_ATOMIC; + switch (cfg->udp_config.family) { + case AF_INET: + fou->udp_offloads.callbacks.gro_receive = fou4_gro_receive; + fou->udp_offloads.callbacks.gro_complete = fou4_gro_complete; + break; + case AF_INET6: + fou->udp_offloads.callbacks.gro_receive = fou6_gro_receive; + fou->udp_offloads.callbacks.gro_complete = fou6_gro_complete; + break; + default: + err = -EPFNOSUPPORT; + goto error; + } + + fou->udp_offloads.port = cfg->udp_config.local_udp_port; +
fou->udp_offloads.ipproto = cfg->protocol; + + if (cfg->udp_config.family == AF_INET) { + err = udp_add_offload(&fou->udp_offloads); + if (err) + goto error; + } + err = fou_add_to_port_list(fou); if (err) goto error; @@ -160,6 +248,7 @@ static int fou_destroy(struct net *net, struct fou_cfg *cfg) spin_lock(&fou_lock); list_for_each_entry(fou, &fou_list, list) { if (fou->port == port) { + udp_del_offload(&fou->udp_offloads); fou_release(fou); err = 0; break; diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index adab393b2fe5..d7c43f764c71 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -276,6 +276,7 @@ unflush: skb_gro_pull(skb, sizeof(struct udphdr)); /* pull encapsulating udp header */ skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr)); + NAPI_GRO_CB(skb)->proto = uo_priv->offload->ipproto; pp = uo_priv->offload->callbacks.gro_receive(head, skb); out_unlock: @@ -329,8 +330,10 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff) break; } - if (uo_priv != NULL) + if (uo_priv != NULL) { + NAPI_GRO_CB(skb)->proto = uo_priv->offload->ipproto; err = uo_priv->offload->callbacks.gro_complete(skb, nhoff + sizeof(struct udphdr)); + } rcu_read_unlock(); return err; -- cgit v1.2.3-59-g8ed1b From 77507aa249aecd06fa25ad058b64481e46887a01 Mon Sep 17 00:00:00 2001 From: Ido Shamay Date: Thu, 18 Sep 2014 11:50:59 +0300 Subject: net/mlx4_core: Enable CQE/EQE stride support This feature is intended for archs having cache line larger than 64B. Since our CQE/EQEs are generally 64B in those systems, HW will write twice to the same cache line consecutively, causing pipe locks due to the hazard prevention mechanism. For elements in a cyclic buffer, writes are consecutive, so entries smaller than a cache line should be avoided, especially if they are written at a high rate. Reduce consecutive writes to the same cache line in CQs/EQs by allowing the driver to increase the distance between entries so that each will reside in a different cache line. Until the introduction of this feature, there were two types of CQE/EQE: 1. 32B stride and context in the [0-31] segment 2. 64B stride and context in the [32-63] segment This feature introduces two additional types: 3. 128B stride and context in the [0-31] segment (128B cache line) 4. 256B stride and context in the [0-31] segment (256B cache line) Modify the mlx4_core driver to query the device for the CQE/EQE cache line stride capability and to enable that capability when the host cache line size is larger than 64 bytes (supported cache lines are 128B and 256B). The mlx4 IB driver and libmlx4 need not be aware of this change. The PF context behaviour is changed to require this change in VF drivers running on such archs. Signed-off-by: Ido Shamay Signed-off-by: Jack Morgenstein Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller
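The cache line claim is easy to check with plain arithmetic: with 64B entries on a 128B line, entries 2k and 2k+1 share a line, so back-to-back writes hit the same line twice, while padding the stride to the line size gives every entry its own line. A small illustrative user-space program (the 128B/64B sizes are assumed, not queried from hardware):

#include <stdio.h>

int main(void)
{
        unsigned int line = 128, entry = 64, stride = line, i;

        for (i = 0; i < 4; i++)
                printf("entry %u: line %u with 64B stride, line %u padded\n",
                       i, i * entry / line, i * stride / line);
        return 0;
}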
--- drivers/net/ethernet/mellanox/mlx4/fw.c | 38 ++++++++++++++++++- drivers/net/ethernet/mellanox/mlx4/fw.h | 2 + drivers/net/ethernet/mellanox/mlx4/main.c | 61 ++++++++++++++++++++++++++++++- drivers/net/ethernet/mellanox/mlx4/mlx4.h | 3 ++ include/linux/mlx4/device.h | 11 ++++-- 5 files changed, 108 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c index 494753e44ae3..13b2e4a51ef4 100644 --- a/drivers/net/ethernet/mellanox/mlx4/fw.c +++ b/drivers/net/ethernet/mellanox/mlx4/fw.c @@ -137,7 +137,9 @@ static void dump_dev_cap_flags2(struct mlx4_dev *dev, u64 flags) [8] = "Dynamic QP updates support", [9] = "Device managed flow steering IPoIB support", [10] = "TCP/IP offloads/flow-steering for VXLAN support", - [11] = "MAD DEMUX (Secure-Host) support" + [11] = "MAD DEMUX (Secure-Host) support", + [12] = "Large cache line (>64B) CQE stride support", + [13] = "Large cache line (>64B) EQE stride support" }; int i; @@ -557,6 +559,7 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) #define QUERY_DEV_CAP_FLOW_STEERING_IPOIB_OFFSET 0x74 #define QUERY_DEV_CAP_FLOW_STEERING_RANGE_EN_OFFSET 0x76 #define QUERY_DEV_CAP_FLOW_STEERING_MAX_QP_OFFSET 0x77 +#define QUERY_DEV_CAP_CQ_EQ_CACHE_LINE_STRIDE 0x7a #define QUERY_DEV_CAP_RDMARC_ENTRY_SZ_OFFSET 0x80 #define QUERY_DEV_CAP_QPC_ENTRY_SZ_OFFSET 0x82 #define QUERY_DEV_CAP_AUX_ENTRY_SZ_OFFSET 0x84 @@ -733,6 +736,11 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) dev_cap->max_rq_sg = field; MLX4_GET(size, outbox, QUERY_DEV_CAP_MAX_DESC_SZ_RQ_OFFSET); dev_cap->max_rq_desc_sz = size; + MLX4_GET(field, outbox, QUERY_DEV_CAP_CQ_EQ_CACHE_LINE_STRIDE); + if (field & (1 << 6)) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_CQE_STRIDE; + if (field & (1 << 7)) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_EQE_STRIDE; MLX4_GET(dev_cap->bmme_flags, outbox, QUERY_DEV_CAP_BMME_FLAGS_OFFSET); @@ -1376,6 +1384,7 @@ int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param) #define INIT_HCA_CQC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x30) #define INIT_HCA_LOG_CQ_OFFSET (INIT_HCA_QPC_OFFSET + 0x37) #define INIT_HCA_EQE_CQE_OFFSETS (INIT_HCA_QPC_OFFSET + 0x38) +#define INIT_HCA_EQE_CQE_STRIDE_OFFSET (INIT_HCA_QPC_OFFSET + 0x3b) #define INIT_HCA_ALTC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x40) #define INIT_HCA_AUXC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x50) #define INIT_HCA_EQC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x60) @@ -1452,11 +1461,25 @@ int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param) if (dev->caps.flags & MLX4_DEV_CAP_FLAG_64B_CQE) { *(inbox + INIT_HCA_EQE_CQE_OFFSETS / 4) |= cpu_to_be32(1 << 30); dev->caps.cqe_size = 64; - dev->caps.userspace_caps |= MLX4_USER_DEV_CAP_64B_CQE; + dev->caps.userspace_caps |= MLX4_USER_DEV_CAP_LARGE_CQE; } else { dev->caps.cqe_size = 32; } + /* CX3 is capable of extending CQEs\EQEs to strides larger than 64B */ + if ((dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_EQE_STRIDE) && + (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_CQE_STRIDE)) { + dev->caps.eqe_size = cache_line_size(); + dev->caps.cqe_size = cache_line_size(); + dev->caps.eqe_factor = 0; + MLX4_PUT(inbox, (u8)((ilog2(dev->caps.eqe_size) - 5) << 4 | + (ilog2(dev->caps.eqe_size) - 5)), + INIT_HCA_EQE_CQE_STRIDE_OFFSET); + + /* User still need to know to support CQE > 32B */ + dev->caps.userspace_caps |= MLX4_USER_DEV_CAP_LARGE_CQE; + } + /* QPC/EEC/CQC/EQC/RDMARC attributes */ MLX4_PUT(inbox, param->qpc_base,
INIT_HCA_QPC_BASE_OFFSET); @@ -1616,6 +1639,17 @@ int mlx4_QUERY_HCA(struct mlx4_dev *dev, if (byte_field & 0x40) /* 64-bytes cqe enabled */ param->dev_cap_enabled |= MLX4_DEV_CAP_64B_CQE_ENABLED; + /* CX3 is capable of extending CQEs\EQEs to strides larger than 64B */ + MLX4_GET(byte_field, outbox, INIT_HCA_EQE_CQE_STRIDE_OFFSET); + if (byte_field) { + param->dev_cap_enabled |= MLX4_DEV_CAP_64B_EQE_ENABLED; + param->dev_cap_enabled |= MLX4_DEV_CAP_64B_CQE_ENABLED; + param->cqe_size = 1 << ((byte_field & + MLX4_CQE_SIZE_MASK_STRIDE) + 5); + param->eqe_size = 1 << (((byte_field & + MLX4_EQE_SIZE_MASK_STRIDE) >> 4) + 5); + } + /* TPT attributes */ MLX4_GET(param->dmpt_base, outbox, INIT_HCA_DMPT_BASE_OFFSET); diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.h b/drivers/net/ethernet/mellanox/mlx4/fw.h index 1fce03ebe5c4..9b835aecac96 100644 --- a/drivers/net/ethernet/mellanox/mlx4/fw.h +++ b/drivers/net/ethernet/mellanox/mlx4/fw.h @@ -178,6 +178,8 @@ struct mlx4_init_hca_param { u8 uar_page_sz; /* log pg sz in 4k chunks */ u8 steering_mode; /* for QUERY_HCA */ u64 dev_cap_enabled; + u16 cqe_size; /* For use only when CQE stride feature enabled */ + u16 eqe_size; /* For use only when EQE stride feature enabled */ }; struct mlx4_init_ib_param { diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c index 7e2d5d57c598..1f10023af1db 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c +++ b/drivers/net/ethernet/mellanox/mlx4/main.c @@ -104,7 +104,8 @@ module_param(enable_64b_cqe_eqe, bool, 0444); MODULE_PARM_DESC(enable_64b_cqe_eqe, "Enable 64 byte CQEs/EQEs when the FW supports this (default: True)"); -#define PF_CONTEXT_BEHAVIOUR_MASK MLX4_FUNC_CAP_64B_EQE_CQE +#define PF_CONTEXT_BEHAVIOUR_MASK (MLX4_FUNC_CAP_64B_EQE_CQE | \ + MLX4_FUNC_CAP_EQE_CQE_STRIDE) static char mlx4_version[] = DRV_NAME ": Mellanox ConnectX core driver v" @@ -196,6 +197,40 @@ static void mlx4_set_port_mask(struct mlx4_dev *dev) dev->caps.port_mask[i] = dev->caps.port_type[i]; } +static void mlx4_enable_cqe_eqe_stride(struct mlx4_dev *dev) +{ + struct mlx4_caps *dev_cap = &dev->caps; + + /* FW not supporting or cancelled by user */ + if (!(dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_EQE_STRIDE) || + !(dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_CQE_STRIDE)) + return; + + /* Must have 64B CQE_EQE enabled by FW to use bigger stride + * When FW has NCSI it may decide not to report 64B CQE/EQEs + */ + if (!(dev_cap->flags & MLX4_DEV_CAP_FLAG_64B_EQE) || + !(dev_cap->flags & MLX4_DEV_CAP_FLAG_64B_CQE)) { + dev_cap->flags2 &= ~MLX4_DEV_CAP_FLAG2_CQE_STRIDE; + dev_cap->flags2 &= ~MLX4_DEV_CAP_FLAG2_EQE_STRIDE; + return; + } + + if (cache_line_size() == 128 || cache_line_size() == 256) { + mlx4_dbg(dev, "Enabling CQE stride cacheLine supported\n"); + /* Changing the real data inside CQE size to 32B */ + dev_cap->flags &= ~MLX4_DEV_CAP_FLAG_64B_CQE; + dev_cap->flags &= ~MLX4_DEV_CAP_FLAG_64B_EQE; + + if (mlx4_is_master(dev)) + dev_cap->function_caps |= MLX4_FUNC_CAP_EQE_CQE_STRIDE; + } else { + mlx4_dbg(dev, "Disabling CQE stride cacheLine unsupported\n"); + dev_cap->flags2 &= ~MLX4_DEV_CAP_FLAG2_CQE_STRIDE; + dev_cap->flags2 &= ~MLX4_DEV_CAP_FLAG2_EQE_STRIDE; + } +} + static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) { int err; @@ -390,6 +425,14 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) dev->caps.flags &= ~MLX4_DEV_CAP_FLAG_64B_CQE; dev->caps.flags &= ~MLX4_DEV_CAP_FLAG_64B_EQE; } + + if (dev_cap->flags2 & + 
(MLX4_DEV_CAP_FLAG2_CQE_STRIDE | + MLX4_DEV_CAP_FLAG2_EQE_STRIDE)) { + mlx4_warn(dev, "Disabling EQE/CQE stride per user request\n"); + dev_cap->flags2 &= ~MLX4_DEV_CAP_FLAG2_CQE_STRIDE; + dev_cap->flags2 &= ~MLX4_DEV_CAP_FLAG2_EQE_STRIDE; + } } if ((dev->caps.flags & @@ -397,6 +440,9 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) mlx4_is_master(dev)) dev->caps.function_caps |= MLX4_FUNC_CAP_64B_EQE_CQE; + if (!mlx4_is_slave(dev)) + mlx4_enable_cqe_eqe_stride(dev); + return 0; } @@ -724,11 +770,22 @@ static int mlx4_slave_cap(struct mlx4_dev *dev) if (hca_param.dev_cap_enabled & MLX4_DEV_CAP_64B_CQE_ENABLED) { dev->caps.cqe_size = 64; - dev->caps.userspace_caps |= MLX4_USER_DEV_CAP_64B_CQE; + dev->caps.userspace_caps |= MLX4_USER_DEV_CAP_LARGE_CQE; } else { dev->caps.cqe_size = 32; } + if (hca_param.dev_cap_enabled & MLX4_DEV_CAP_EQE_STRIDE_ENABLED) { + dev->caps.eqe_size = hca_param.eqe_size; + dev->caps.eqe_factor = 0; + } + + if (hca_param.dev_cap_enabled & MLX4_DEV_CAP_CQE_STRIDE_ENABLED) { + dev->caps.cqe_size = hca_param.cqe_size; + /* User still need to know when CQE > 32B */ + dev->caps.userspace_caps |= MLX4_USER_DEV_CAP_LARGE_CQE; + } + dev->caps.flags2 &= ~MLX4_DEV_CAP_FLAG2_TS; mlx4_warn(dev, "Timestamping is not supported in slave mode\n"); diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h index b508c7887ef8..de10dbb2e6ed 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h @@ -285,6 +285,9 @@ struct mlx4_icm_table { #define MLX4_MPT_STATUS_SW 0xF0 #define MLX4_MPT_STATUS_HW 0x00 +#define MLX4_CQE_SIZE_MASK_STRIDE 0x3 +#define MLX4_EQE_SIZE_MASK_STRIDE 0x30 + /* * Must be packed because mtt_seg is 64 bits but only aligned to 32 bits. */ diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 1befd8df9cfc..7bcefe749a39 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -185,19 +185,24 @@ enum { MLX4_DEV_CAP_FLAG2_DMFS_IPOIB = 1LL << 9, MLX4_DEV_CAP_FLAG2_VXLAN_OFFLOADS = 1LL << 10, MLX4_DEV_CAP_FLAG2_MAD_DEMUX = 1LL << 11, + MLX4_DEV_CAP_FLAG2_CQE_STRIDE = 1LL << 12, + MLX4_DEV_CAP_FLAG2_EQE_STRIDE = 1LL << 13 }; enum { MLX4_DEV_CAP_64B_EQE_ENABLED = 1LL << 0, - MLX4_DEV_CAP_64B_CQE_ENABLED = 1LL << 1 + MLX4_DEV_CAP_64B_CQE_ENABLED = 1LL << 1, + MLX4_DEV_CAP_CQE_STRIDE_ENABLED = 1LL << 2, + MLX4_DEV_CAP_EQE_STRIDE_ENABLED = 1LL << 3 }; enum { - MLX4_USER_DEV_CAP_64B_CQE = 1L << 0 + MLX4_USER_DEV_CAP_LARGE_CQE = 1L << 0 }; enum { - MLX4_FUNC_CAP_64B_EQE_CQE = 1L << 0 + MLX4_FUNC_CAP_64B_EQE_CQE = 1L << 0, + MLX4_FUNC_CAP_EQE_CQE_STRIDE = 1L << 1 }; -- cgit v1.2.3-59-g8ed1b From 53e50398968d43338c4d932114e68bc099fc5fbd Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Sat, 20 Sep 2014 14:52:30 -0700 Subject: net: Remove gso_send_check as an offload callback The send_check logic was only interesting in cases of TCP offload and UDP UFO where the checksum needed to be initialized to the pseudo header checksum. Now we've moved that logic into the related gso_segment functions so gso_send_check is no longer needed. Signed-off-by: Tom Herbert Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 1 - net/core/dev.c | 10 ---------- net/ipv4/af_inet.c | 36 ------------------------------------ net/ipv4/gre_offload.c | 11 +++-------- net/ipv4/tcp_offload.c | 6 ------ net/ipv4/udp_offload.c | 6 ------ net/ipv6/ip6_offload.c | 27 --------------------------- net/ipv6/tcpv6_offload.c | 6 ------ net/ipv6/udp_offload.c | 6 ------ net/mpls/mpls_gso.c | 7 ------- 10 files changed, 3 insertions(+), 113 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 4354b4307e37..9f5d293a0281 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1911,7 +1911,6 @@ struct packet_type { struct offload_callbacks { struct sk_buff *(*gso_segment)(struct sk_buff *skb, netdev_features_t features); - int (*gso_send_check)(struct sk_buff *skb); struct sk_buff **(*gro_receive)(struct sk_buff **head, struct sk_buff *skb); int (*gro_complete)(struct sk_buff *skb, int nhoff); diff --git a/net/core/dev.c b/net/core/dev.c index db0388607329..e2ced01c01e4 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2422,16 +2422,6 @@ struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb, rcu_read_lock(); list_for_each_entry_rcu(ptype, &offload_base, list) { if (ptype->type == type && ptype->callbacks.gso_segment) { - if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) { - int err; - - err = ptype->callbacks.gso_send_check(skb); - segs = ERR_PTR(err); - if (err || skb_gso_ok(skb, features)) - break; - __skb_push(skb, (skb->data - - skb_network_header(skb))); - } segs = ptype->callbacks.gso_segment(skb, features); break; } diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 72011cc4c13b..28e589c5f32d 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1197,40 +1197,6 @@ int inet_sk_rebuild_header(struct sock *sk) } EXPORT_SYMBOL(inet_sk_rebuild_header); -static int inet_gso_send_check(struct sk_buff *skb) -{ - const struct net_offload *ops; - const struct iphdr *iph; - int proto; - int ihl; - int err = -EINVAL; - - if (unlikely(!pskb_may_pull(skb, sizeof(*iph)))) - goto out; - - iph = ip_hdr(skb); - ihl = iph->ihl * 4; - if (ihl < sizeof(*iph)) - goto out; - - proto = iph->protocol; - - /* Warning: after this point, iph might be no longer valid */ - if (unlikely(!pskb_may_pull(skb, ihl))) - goto out; - __skb_pull(skb, ihl); - - skb_reset_transport_header(skb); - err = -EPROTONOSUPPORT; - - ops = rcu_dereference(inet_offloads[proto]); - if (likely(ops && ops->callbacks.gso_send_check)) - err = ops->callbacks.gso_send_check(skb); - -out: - return err; -} - static struct sk_buff *inet_gso_segment(struct sk_buff *skb, netdev_features_t features) { @@ -1655,7 +1621,6 @@ static int ipv4_proc_init(void); static struct packet_offload ip_packet_offload __read_mostly = { .type = cpu_to_be16(ETH_P_IP), .callbacks = { - .gso_send_check = inet_gso_send_check, .gso_segment = inet_gso_segment, .gro_receive = inet_gro_receive, .gro_complete = inet_gro_complete, @@ -1664,7 +1629,6 @@ static struct packet_offload ip_packet_offload __read_mostly = { static const struct net_offload ipip_offload = { .callbacks = { - .gso_send_check = inet_gso_send_check, .gso_segment = inet_gso_segment, .gro_receive = inet_gro_receive, .gro_complete = inet_gro_complete, diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index d3fe2ac05167..a77729503071 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -15,13 +15,6 @@ #include #include -static int gre_gso_send_check(struct sk_buff *skb) -{ - if 
(!skb->encapsulation) - return -EINVAL; - return 0; -} - static struct sk_buff *gre_gso_segment(struct sk_buff *skb, netdev_features_t features) { @@ -46,6 +39,9 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, SKB_GSO_IPIP))) goto out; + if (!skb->encapsulation) + goto out; + if (unlikely(!pskb_may_pull(skb, sizeof(*greh)))) goto out; @@ -256,7 +252,6 @@ static int gre_gro_complete(struct sk_buff *skb, int nhoff) static const struct net_offload gre_offload = { .callbacks = { - .gso_send_check = gre_gso_send_check, .gso_segment = gre_gso_segment, .gro_receive = gre_gro_receive, .gro_complete = gre_gro_complete, diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 7cd12b0458ff..5b90f2f447a5 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -288,11 +288,6 @@ int tcp_gro_complete(struct sk_buff *skb) } EXPORT_SYMBOL(tcp_gro_complete); -static int tcp_v4_gso_send_check(struct sk_buff *skb) -{ - return 0; -} - static struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb) { /* Don't bother verifying checksum if we're going to flush anyway. */ @@ -320,7 +315,6 @@ static int tcp4_gro_complete(struct sk_buff *skb, int thoff) static const struct net_offload tcpv4_offload = { .callbacks = { - .gso_send_check = tcp_v4_gso_send_check, .gso_segment = tcp4_gso_segment, .gro_receive = tcp4_gro_receive, .gro_complete = tcp4_gro_complete, diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 2918cc914824..19ebe6a39ddc 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -25,11 +25,6 @@ struct udp_offload_priv { struct udp_offload_priv __rcu *next; }; -static int udp4_ufo_send_check(struct sk_buff *skb) -{ - return 0; -} - struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, netdev_features_t features) { @@ -346,7 +341,6 @@ static int udp4_gro_complete(struct sk_buff *skb, int nhoff) static const struct net_offload udpv4_offload = { .callbacks = { - .gso_send_check = udp4_ufo_send_check, .gso_segment = udp4_ufo_fragment, .gro_receive = udp4_gro_receive, .gro_complete = udp4_gro_complete, diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index 9952f3fce30a..9034f76ae013 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -53,31 +53,6 @@ static int ipv6_gso_pull_exthdrs(struct sk_buff *skb, int proto) return proto; } -static int ipv6_gso_send_check(struct sk_buff *skb) -{ - const struct ipv6hdr *ipv6h; - const struct net_offload *ops; - int err = -EINVAL; - - if (unlikely(!pskb_may_pull(skb, sizeof(*ipv6h)))) - goto out; - - ipv6h = ipv6_hdr(skb); - __skb_pull(skb, sizeof(*ipv6h)); - err = -EPROTONOSUPPORT; - - ops = rcu_dereference(inet6_offloads[ - ipv6_gso_pull_exthdrs(skb, ipv6h->nexthdr)]); - - if (likely(ops && ops->callbacks.gso_send_check)) { - skb_reset_transport_header(skb); - err = ops->callbacks.gso_send_check(skb); - } - -out: - return err; -} - static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, netdev_features_t features) { @@ -306,7 +281,6 @@ out_unlock: static struct packet_offload ipv6_packet_offload __read_mostly = { .type = cpu_to_be16(ETH_P_IPV6), .callbacks = { - .gso_send_check = ipv6_gso_send_check, .gso_segment = ipv6_gso_segment, .gro_receive = ipv6_gro_receive, .gro_complete = ipv6_gro_complete, @@ -315,7 +289,6 @@ static struct packet_offload ipv6_packet_offload __read_mostly = { static const struct net_offload sit_offload = { .callbacks = { - .gso_send_check = ipv6_gso_send_check, .gso_segment = ipv6_gso_segment, .gro_receive = 
ipv6_gro_receive, .gro_complete = ipv6_gro_complete, diff --git a/net/ipv6/tcpv6_offload.c b/net/ipv6/tcpv6_offload.c index 96253154db3a..c1ab77105b4c 100644 --- a/net/ipv6/tcpv6_offload.c +++ b/net/ipv6/tcpv6_offload.c @@ -15,11 +15,6 @@ #include #include "ip6_offload.h" -static int tcp_v6_gso_send_check(struct sk_buff *skb) -{ - return 0; -} - static struct sk_buff **tcp6_gro_receive(struct sk_buff **head, struct sk_buff *skb) { @@ -71,7 +66,6 @@ struct sk_buff *tcp6_gso_segment(struct sk_buff *skb, } static const struct net_offload tcpv6_offload = { .callbacks = { - .gso_send_check = tcp_v6_gso_send_check, .gso_segment = tcp6_gso_segment, .gro_receive = tcp6_gro_receive, .gro_complete = tcp6_gro_complete, diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index e4af6437ea3b..212ebfc7973f 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -17,11 +17,6 @@ #include #include "ip6_offload.h" -static int udp6_ufo_send_check(struct sk_buff *skb) -{ - return 0; -} - static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, netdev_features_t features) { @@ -166,7 +161,6 @@ static int udp6_gro_complete(struct sk_buff *skb, int nhoff) static const struct net_offload udpv6_offload = { .callbacks = { - .gso_send_check = udp6_ufo_send_check, .gso_segment = udp6_ufo_fragment, .gro_receive = udp6_gro_receive, .gro_complete = udp6_gro_complete, diff --git a/net/mpls/mpls_gso.c b/net/mpls/mpls_gso.c index 6b38d083e1c9..e28ed2ef5b06 100644 --- a/net/mpls/mpls_gso.c +++ b/net/mpls/mpls_gso.c @@ -65,15 +65,9 @@ out: return segs; } -static int mpls_gso_send_check(struct sk_buff *skb) -{ - return 0; -} - static struct packet_offload mpls_mc_offload = { .type = cpu_to_be16(ETH_P_MPLS_MC), .callbacks = { - .gso_send_check = mpls_gso_send_check, .gso_segment = mpls_gso_segment, }, }; @@ -81,7 +75,6 @@ static struct packet_offload mpls_mc_offload = { static struct packet_offload mpls_uc_offload = { .type = cpu_to_be16(ETH_P_MPLS_UC), .callbacks = { - .gso_send_check = mpls_gso_send_check, .gso_segment = mpls_gso_segment, }, }; -- cgit v1.2.3-59-g8ed1b From 7276ca3fa23864133f5ee7431c51546d9b7f695f Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 22 Sep 2014 13:28:16 +0200 Subject: netfilter: bridge: nf_bridge_copy_header as static inline in header Move nf_bridge_copy_header() as static inline in netfilter_bridge.h header file. This patch prepares the modularization of the br_netfilter code. 
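The mechanics behind the move: a function defined static inline in a header is compiled into every translation unit that includes it, so the bridge core no longer needs to export the symbol, and a separate br_netfilter module can call it without a link-time dependency. A minimal stand-alone sketch of the pattern (hypothetical names, not the kernel header):

/* helper.h */
#ifndef HELPER_H
#define HELPER_H

static inline unsigned int encap_header_len(unsigned short proto)
{
        return proto == 0x8100 ? 4 : 0;   /* e.g. an 802.1Q tag */
}

#endif /* HELPER_H */

/* any .c file that includes helper.h gets its own copy; nothing
 * needs EXPORT_SYMBOL() and no module dependency is created. */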
Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_bridge.h | 48 +++++++++++++++++++++++++++++++--------- net/bridge/br_netfilter.c | 28 ----------------------- 2 files changed, 38 insertions(+), 38 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter_bridge.h b/include/linux/netfilter_bridge.h index 8ab1c278b66d..fe996d59de64 100644 --- a/include/linux/netfilter_bridge.h +++ b/include/linux/netfilter_bridge.h @@ -24,16 +24,6 @@ enum nf_br_hook_priorities { #define BRNF_8021Q 0x10 #define BRNF_PPPoE 0x20 -/* Only used in br_forward.c */ -int nf_bridge_copy_header(struct sk_buff *skb); -static inline int nf_bridge_maybe_copy_header(struct sk_buff *skb) -{ - if (skb->nf_bridge && - skb->nf_bridge->mask & (BRNF_BRIDGED | BRNF_BRIDGED_DNAT)) - return nf_bridge_copy_header(skb); - return 0; -} - static inline unsigned int nf_bridge_encap_header_len(const struct sk_buff *skb) { switch (skb->protocol) { @@ -46,6 +36,44 @@ static inline unsigned int nf_bridge_encap_header_len(const struct sk_buff *skb) } } +static inline void nf_bridge_update_protocol(struct sk_buff *skb) +{ + if (skb->nf_bridge->mask & BRNF_8021Q) + skb->protocol = htons(ETH_P_8021Q); + else if (skb->nf_bridge->mask & BRNF_PPPoE) + skb->protocol = htons(ETH_P_PPP_SES); +} + +/* Fill in the header for fragmented IP packets handled by + * the IPv4 connection tracking code. + * + * Only used in br_forward.c + */ +static inline int nf_bridge_copy_header(struct sk_buff *skb) +{ + int err; + unsigned int header_size; + + nf_bridge_update_protocol(skb); + header_size = ETH_HLEN + nf_bridge_encap_header_len(skb); + err = skb_cow_head(skb, header_size); + if (err) + return err; + + skb_copy_to_linear_data_offset(skb, -header_size, + skb->nf_bridge->data, header_size); + __skb_push(skb, nf_bridge_encap_header_len(skb)); + return 0; +} + +static inline int nf_bridge_maybe_copy_header(struct sk_buff *skb) +{ + if (skb->nf_bridge && + skb->nf_bridge->mask & (BRNF_BRIDGED | BRNF_BRIDGED_DNAT)) + return nf_bridge_copy_header(skb); + return 0; +} + static inline unsigned int nf_bridge_mtu_reduction(const struct sk_buff *skb) { if (unlikely(skb->nf_bridge->mask & BRNF_PPPoE)) diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index a615264cf01a..61929a7cd815 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -245,14 +245,6 @@ static inline void nf_bridge_save_header(struct sk_buff *skb) skb->nf_bridge->data, header_size); } -static inline void nf_bridge_update_protocol(struct sk_buff *skb) -{ - if (skb->nf_bridge->mask & BRNF_8021Q) - skb->protocol = htons(ETH_P_8021Q); - else if (skb->nf_bridge->mask & BRNF_PPPoE) - skb->protocol = htons(ETH_P_PPP_SES); -} - /* When handing a packet over to the IP layer * check whether we have a skb that is in the * expected format @@ -320,26 +312,6 @@ drop: return -1; } -/* Fill in the header for fragmented IP packets handled by - * the IPv4 connection tracking code. 
- */ -int nf_bridge_copy_header(struct sk_buff *skb) -{ - int err; - unsigned int header_size; - - nf_bridge_update_protocol(skb); - header_size = ETH_HLEN + nf_bridge_encap_header_len(skb); - err = skb_cow_head(skb, header_size); - if (err) - return err; - - skb_copy_to_linear_data_offset(skb, -header_size, - skb->nf_bridge->data, header_size); - __skb_push(skb, nf_bridge_encap_header_len(skb)); - return 0; -} - /* PF_BRIDGE/PRE_ROUTING *********************************************/ /* Undo the changes made for ip6tables PREROUTING and continue the * bridge PRE_ROUTING hook. */ -- cgit v1.2.3-59-g8ed1b From 34666d467cbf1e2e3c7bb15a63eccfb582cdd71f Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 18 Sep 2014 11:29:03 +0200 Subject: netfilter: bridge: move br_netfilter out of the core Jesper reported that br_netfilter always registers the hooks since this is part of the bridge core. This harms performance for people that don't need this. This patch modularizes br_netfilter so it can be rmmod'ed and, thus, the hooks can be unregistered. I think the bridge netfilter should have been a separate module since the beginning; Patrick agreed on that. Note that this is breaking compatibility for users that expect that bridge netfilter is going to be available after explicitly 'modprobe bridge' or via automatic load through brctl. However, the damage can be easily undone by modprobing br_netfilter. The bridge core also prints a message to provide a clue to people that didn't notice that this has been deprecated. On top of that, the plan is that nftables will not rely on this software layer, but integrate the connection tracking into the bridge layer to enable stateful filtering and NAT, which is what bridge netfilter users seem to require. This patch still keeps the fake_dst_ops in the bridge core, since this is required when the bridge port is initialized. So we can safely modprobe/rmmod br_netfilter anytime.
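One mechanical consequence of the tristate change shows up all over the diff below: every #ifdef CONFIG_BRIDGE_NETFILTER has to become #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER), because a CONFIG_BRIDGE_NETFILTER=m build defines CONFIG_BRIDGE_NETFILTER_MODULE instead, and a plain #ifdef would silently compile the guarded code out. A minimal stand-alone sketch of the kconfig convention (simplified; the real IS_ENABLED() lives in include/linux/kconfig.h):

	#include <stdio.h>

	/* Pretend this unit is built with CONFIG_BRIDGE_NETFILTER=m: kconfig
	 * then defines the _MODULE symbol, not the bare CONFIG_ symbol. */
	#define CONFIG_BRIDGE_NETFILTER_MODULE 1

	/* Simplified IS_ENABLED() semantics: true for =y or =m builds. */
	#if defined(CONFIG_BRIDGE_NETFILTER) || defined(CONFIG_BRIDGE_NETFILTER_MODULE)
	#define BRNF_ENABLED 1
	#else
	#define BRNF_ENABLED 0
	#endif

	int main(void)
	{
		/* a bare #ifdef CONFIG_BRIDGE_NETFILTER would report 0 here */
		printf("bridge netfilter built in or as module: %d\n", BRNF_ENABLED);
		return 0;
	}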
Signed-off-by: Pablo Neira Ayuso Acked-by: Florian Westphal --- include/linux/netfilter_bridge.h | 2 +- include/linux/skbuff.h | 12 ++--- include/net/neighbour.h | 2 +- include/net/netfilter/ipv4/nf_reject.h | 2 +- include/net/netfilter/ipv6/nf_reject.h | 2 +- net/Kconfig | 7 +-- net/bridge/Makefile | 5 +- net/bridge/br.c | 14 ++--- net/bridge/br_device.c | 4 +- net/bridge/br_forward.c | 2 + net/bridge/br_input.c | 1 + net/bridge/br_netfilter.c | 88 ++++++------------------------- net/bridge/br_netlink.c | 2 +- net/bridge/br_nf_core.c | 96 ++++++++++++++++++++++++++++++++++ net/bridge/br_private.h | 12 ++--- net/bridge/br_sysfs_br.c | 4 +- 16 files changed, 151 insertions(+), 104 deletions(-) create mode 100644 net/bridge/br_nf_core.c (limited to 'include/linux') diff --git a/include/linux/netfilter_bridge.h b/include/linux/netfilter_bridge.h index fe996d59de64..c755e4971fa3 100644 --- a/include/linux/netfilter_bridge.h +++ b/include/linux/netfilter_bridge.h @@ -15,7 +15,7 @@ enum nf_br_hook_priorities { NF_BR_PRI_LAST = INT_MAX, }; -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) #define BRNF_PKT_TYPE 0x01 #define BRNF_BRIDGED_DNAT 0x02 diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 07c9fdd0c126..c4ff43f84573 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -156,7 +156,7 @@ struct nf_conntrack { }; #endif -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) struct nf_bridge_info { atomic_t use; unsigned int mask; @@ -560,7 +560,7 @@ struct sk_buff { #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) struct nf_conntrack *nfct; #endif -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) struct nf_bridge_info *nf_bridge; #endif @@ -2977,7 +2977,7 @@ static inline void nf_conntrack_get(struct nf_conntrack *nfct) atomic_inc(&nfct->use); } #endif -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) static inline void nf_bridge_put(struct nf_bridge_info *nf_bridge) { if (nf_bridge && atomic_dec_and_test(&nf_bridge->use)) @@ -2995,7 +2995,7 @@ static inline void nf_reset(struct sk_buff *skb) nf_conntrack_put(skb->nfct); skb->nfct = NULL; #endif -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) nf_bridge_put(skb->nf_bridge); skb->nf_bridge = NULL; #endif @@ -3016,7 +3016,7 @@ static inline void __nf_copy(struct sk_buff *dst, const struct sk_buff *src) nf_conntrack_get(src->nfct); dst->nfctinfo = src->nfctinfo; #endif -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) dst->nf_bridge = src->nf_bridge; nf_bridge_get(src->nf_bridge); #endif @@ -3030,7 +3030,7 @@ static inline void nf_copy(struct sk_buff *dst, const struct sk_buff *src) #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) nf_conntrack_put(dst->nfct); #endif -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) nf_bridge_put(dst->nf_bridge); #endif __nf_copy(dst, src); diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 47f425464f84..f60558d0254c 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -373,7 +373,7 @@ static inline int neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) return 0; } -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) static inline int neigh_hh_bridge(struct hh_cache *hh, struct sk_buff *skb) { unsigned int seq, hh_alen; diff --git a/include/net/netfilter/ipv4/nf_reject.h b/include/net/netfilter/ipv4/nf_reject.h index 
931fbf812171..f713b5a31d62 100644 --- a/include/net/netfilter/ipv4/nf_reject.h +++ b/include/net/netfilter/ipv4/nf_reject.h @@ -98,7 +98,7 @@ static void nf_send_reset(struct sk_buff *oldskb, int hook) nf_ct_attach(nskb, oldskb); -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) /* If we use ip_local_out for bridged traffic, the MAC source on * the RST will be ours, instead of the destination's. This confuses * some routers/firewalls, and they drop the packet. So we need to diff --git a/include/net/netfilter/ipv6/nf_reject.h b/include/net/netfilter/ipv6/nf_reject.h index 710d17ed70b4..7a10cfcd8e33 100644 --- a/include/net/netfilter/ipv6/nf_reject.h +++ b/include/net/netfilter/ipv6/nf_reject.h @@ -147,7 +147,7 @@ static void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook) nf_ct_attach(nskb, oldskb); -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) /* If we use ip6_local_out for bridged traffic, the MAC source on * the RST will be ours, instead of the destination's. This confuses * some routers/firewalls, and they drop the packet. So we need to diff --git a/net/Kconfig b/net/Kconfig index 4051fdfa4367..dc5d700d05e7 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -176,10 +176,11 @@ config NETFILTER_ADVANCED If unsure, say Y. config BRIDGE_NETFILTER - bool "Bridged IP/ARP packets filtering" - depends on BRIDGE && NETFILTER && INET + tristate "Bridged IP/ARP packets filtering" + depends on (BRIDGE || BRIDGE=n) + depends on NETFILTER && INET depends on NETFILTER_ADVANCED - default y + default m ---help--- Enabling this option will let arptables resp. iptables see bridged ARP resp. IP traffic. If you want a bridging firewall, you probably diff --git a/net/bridge/Makefile b/net/bridge/Makefile index 8590b942bffa..5e3eac5dc8b9 100644 --- a/net/bridge/Makefile +++ b/net/bridge/Makefile @@ -6,11 +6,12 @@ obj-$(CONFIG_BRIDGE) += bridge.o bridge-y := br.o br_device.o br_fdb.o br_forward.o br_if.o br_input.o \ br_ioctl.o br_stp.o br_stp_bpdu.o \ - br_stp_if.o br_stp_timer.o br_netlink.o + br_stp_if.o br_stp_timer.o br_netlink.o \ + br_nf_core.o bridge-$(CONFIG_SYSFS) += br_sysfs_if.o br_sysfs_br.o -bridge-$(CONFIG_BRIDGE_NETFILTER) += br_netfilter.o +obj-$(CONFIG_BRIDGE_NETFILTER) += br_netfilter.o bridge-$(CONFIG_BRIDGE_IGMP_SNOOPING) += br_multicast.o br_mdb.o diff --git a/net/bridge/br.c b/net/bridge/br.c index 1a755a1e5410..44425aff7cba 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -161,7 +161,7 @@ static int __init br_init(void) if (err) goto err_out1; - err = br_netfilter_init(); + err = br_nf_core_init(); if (err) goto err_out2; @@ -179,11 +179,16 @@ static int __init br_init(void) br_fdb_test_addr_hook = br_fdb_test_addr; #endif + pr_info("bridge: automatic filtering via arp/ip/ip6tables has been " + "deprecated. 
Update your scripts to load br_netfilter if you " + "need this.\n"); + return 0; + err_out4: unregister_netdevice_notifier(&br_device_notifier); err_out3: - br_netfilter_fini(); + br_nf_core_fini(); err_out2: unregister_pernet_subsys(&br_net_ops); err_out1: @@ -196,20 +201,17 @@ err_out: static void __exit br_deinit(void) { stp_proto_unregister(&br_stp_proto); - br_netlink_fini(); unregister_netdevice_notifier(&br_device_notifier); brioctl_set(NULL); - unregister_pernet_subsys(&br_net_ops); rcu_barrier(); /* Wait for completion of call_rcu()'s */ - br_netfilter_fini(); + br_nf_core_fini(); #if IS_ENABLED(CONFIG_ATM_LANE) br_fdb_test_addr_hook = NULL; #endif - br_fdb_fini(); } diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 568cccd39a3d..659cac15c0df 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -36,7 +36,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) u16 vid = 0; rcu_read_lock(); -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) if (skb->nf_bridge && (skb->nf_bridge->mask & BRNF_BRIDGED_DNAT)) { br_nf_pre_routing_finish_bridge_slow(skb); rcu_read_unlock(); @@ -167,7 +167,7 @@ static int br_change_mtu(struct net_device *dev, int new_mtu) dev->mtu = new_mtu; -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) /* remember the MTU in the rtable for PMTU */ dst_metric_set(&br->fake_rtable.dst, RTAX_MTU, new_mtu); #endif diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 056b67b0e277..992ec49a96aa 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -49,6 +49,7 @@ int br_dev_queue_push_xmit(struct sk_buff *skb) return 0; } +EXPORT_SYMBOL_GPL(br_dev_queue_push_xmit); int br_forward_finish(struct sk_buff *skb) { @@ -56,6 +57,7 @@ int br_forward_finish(struct sk_buff *skb) br_dev_queue_push_xmit); } +EXPORT_SYMBOL_GPL(br_forward_finish); static void __br_deliver(const struct net_bridge_port *to, struct sk_buff *skb) { diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 366c43649079..6fd5522df696 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -140,6 +140,7 @@ drop: kfree_skb(skb); goto out; } +EXPORT_SYMBOL_GPL(br_handle_frame_finish); /* note: already called with rcu_read_lock */ static int br_handle_local_finish(struct sk_buff *skb) diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index 61929a7cd815..97e43937aaca 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -111,66 +111,6 @@ static inline __be16 pppoe_proto(const struct sk_buff *skb) pppoe_proto(skb) == htons(PPP_IPV6) && \ brnf_filter_pppoe_tagged) -static void fake_update_pmtu(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb, u32 mtu) -{ -} - -static void fake_redirect(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb) -{ -} - -static u32 *fake_cow_metrics(struct dst_entry *dst, unsigned long old) -{ - return NULL; -} - -static struct neighbour *fake_neigh_lookup(const struct dst_entry *dst, - struct sk_buff *skb, - const void *daddr) -{ - return NULL; -} - -static unsigned int fake_mtu(const struct dst_entry *dst) -{ - return dst->dev->mtu; -} - -static struct dst_ops fake_dst_ops = { - .family = AF_INET, - .protocol = cpu_to_be16(ETH_P_IP), - .update_pmtu = fake_update_pmtu, - .redirect = fake_redirect, - .cow_metrics = fake_cow_metrics, - .neigh_lookup = fake_neigh_lookup, - .mtu = fake_mtu, -}; - -/* - * Initialize bogus route table used to keep netfilter happy. 
- * Currently, we fill in the PMTU entry because netfilter - * refragmentation needs it, and the rt_flags entry because - * ipt_REJECT needs it. Future netfilter modules might - * require us to fill additional fields. - */ -static const u32 br_dst_default_metrics[RTAX_MAX] = { - [RTAX_MTU - 1] = 1500, -}; - -void br_netfilter_rtable_init(struct net_bridge *br) -{ - struct rtable *rt = &br->fake_rtable; - - atomic_set(&rt->dst.__refcnt, 1); - rt->dst.dev = br->dev; - rt->dst.path = &rt->dst; - dst_init_metrics(&rt->dst, br_dst_default_metrics, true); - rt->dst.flags = DST_NOXFRM | DST_FAKE_RTABLE; - rt->dst.ops = &fake_dst_ops; -} - static inline struct rtable *bridge_parent_rtable(const struct net_device *dev) { struct net_bridge_port *port; @@ -1031,38 +971,42 @@ static struct ctl_table brnf_table[] = { }; #endif -int __init br_netfilter_init(void) +static int __init br_netfilter_init(void) { int ret; - ret = dst_entries_init(&fake_dst_ops); + ret = nf_register_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops)); if (ret < 0) return ret; - ret = nf_register_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops)); - if (ret < 0) { - dst_entries_destroy(&fake_dst_ops); - return ret; - } #ifdef CONFIG_SYSCTL brnf_sysctl_header = register_net_sysctl(&init_net, "net/bridge", brnf_table); if (brnf_sysctl_header == NULL) { printk(KERN_WARNING "br_netfilter: can't register to sysctl.\n"); - nf_unregister_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops)); - dst_entries_destroy(&fake_dst_ops); - return -ENOMEM; + ret = -ENOMEM; + goto err1; } #endif printk(KERN_NOTICE "Bridge firewalling registered\n"); return 0; +err1: + nf_unregister_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops)); + return ret; } -void br_netfilter_fini(void) +static void __exit br_netfilter_fini(void) { nf_unregister_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops)); #ifdef CONFIG_SYSCTL unregister_net_sysctl_table(brnf_sysctl_header); #endif - dst_entries_destroy(&fake_dst_ops); } + +module_init(br_netfilter_init); +module_exit(br_netfilter_fini); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Lennert Buytenhek "); +MODULE_AUTHOR("Bart De Schuymer "); +MODULE_DESCRIPTION("Linux ethernet netfilter firewall bridge"); diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 90a91e137acc..0fa66b83685f 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -602,7 +602,7 @@ out_af: return err; } -void __exit br_netlink_fini(void) +void br_netlink_fini(void) { br_mdb_uninit(); rtnl_af_unregister(&br_af_ops); diff --git a/net/bridge/br_nf_core.c b/net/bridge/br_nf_core.c new file mode 100644 index 000000000000..387cb3bd017c --- /dev/null +++ b/net/bridge/br_nf_core.c @@ -0,0 +1,96 @@ +/* + * Handle firewalling core + * Linux ethernet bridge + * + * Authors: + * Lennert Buytenhek + * Bart De Schuymer + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Lennert dedicates this file to Kerstin Wurdinger. 
+ */ + +#include +#include +#include +#include +#include + +#include "br_private.h" +#ifdef CONFIG_SYSCTL +#include +#endif + +static void fake_update_pmtu(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb, u32 mtu) +{ +} + +static void fake_redirect(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb) +{ +} + +static u32 *fake_cow_metrics(struct dst_entry *dst, unsigned long old) +{ + return NULL; +} + +static struct neighbour *fake_neigh_lookup(const struct dst_entry *dst, + struct sk_buff *skb, + const void *daddr) +{ + return NULL; +} + +static unsigned int fake_mtu(const struct dst_entry *dst) +{ + return dst->dev->mtu; +} + +static struct dst_ops fake_dst_ops = { + .family = AF_INET, + .protocol = cpu_to_be16(ETH_P_IP), + .update_pmtu = fake_update_pmtu, + .redirect = fake_redirect, + .cow_metrics = fake_cow_metrics, + .neigh_lookup = fake_neigh_lookup, + .mtu = fake_mtu, +}; + +/* + * Initialize bogus route table used to keep netfilter happy. + * Currently, we fill in the PMTU entry because netfilter + * refragmentation needs it, and the rt_flags entry because + * ipt_REJECT needs it. Future netfilter modules might + * require us to fill additional fields. + */ +static const u32 br_dst_default_metrics[RTAX_MAX] = { + [RTAX_MTU - 1] = 1500, +}; + +void br_netfilter_rtable_init(struct net_bridge *br) +{ + struct rtable *rt = &br->fake_rtable; + + atomic_set(&rt->dst.__refcnt, 1); + rt->dst.dev = br->dev; + rt->dst.path = &rt->dst; + dst_init_metrics(&rt->dst, br_dst_default_metrics, true); + rt->dst.flags = DST_NOXFRM | DST_FAKE_RTABLE; + rt->dst.ops = &fake_dst_ops; +} + +int __init br_nf_core_init(void) +{ + return dst_entries_init(&fake_dst_ops); +} + +void br_nf_core_fini(void) +{ + dst_entries_destroy(&fake_dst_ops); +} diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 62a7fa2e3569..d304d752c091 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -221,7 +221,7 @@ struct net_bridge struct pcpu_sw_netstats __percpu *stats; spinlock_t hash_lock; struct hlist_head hash[BR_HASH_SIZE]; -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) struct rtable fake_rtable; bool nf_call_iptables; bool nf_call_ip6tables; @@ -751,13 +751,13 @@ static inline int br_vlan_enabled(struct net_bridge *br) #endif /* br_netfilter.c */ -#ifdef CONFIG_BRIDGE_NETFILTER -int br_netfilter_init(void); -void br_netfilter_fini(void); +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) +int br_nf_core_init(void); +void br_nf_core_fini(void); void br_netfilter_rtable_init(struct net_bridge *); #else -#define br_netfilter_init() (0) -#define br_netfilter_fini() do { } while (0) +static inline int br_nf_core_init(void) { return 0; } +static inline void br_nf_core_fini(void) {} #define br_netfilter_rtable_init(x) #endif diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c index c9e2572b15f4..cb431c6016ee 100644 --- a/net/bridge/br_sysfs_br.c +++ b/net/bridge/br_sysfs_br.c @@ -629,7 +629,7 @@ static ssize_t multicast_startup_query_interval_store( } static DEVICE_ATTR_RW(multicast_startup_query_interval); #endif -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) static ssize_t nf_call_iptables_show( struct device *d, struct device_attribute *attr, char *buf) { @@ -763,7 +763,7 @@ static struct attribute *bridge_attrs[] = { &dev_attr_multicast_query_response_interval.attr, &dev_attr_multicast_startup_query_interval.attr, #endif -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) 
&dev_attr_nf_call_iptables.attr, &dev_attr_nf_call_ip6tables.attr, &dev_attr_nf_call_arptables.attr, -- cgit v1.2.3-59-g8ed1b From 99c55f7d47c0dc6fc64729f37bf435abf43f4c60 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 26 Sep 2014 00:16:57 -0700 Subject: bpf: introduce BPF syscall and maps BPF syscall is a multiplexor for a range of different operations on eBPF. This patch introduces syscall with single command to create a map. Next patch adds commands to access maps. 'maps' is a generic storage of different types for sharing data between kernel and userspace. Userspace example: /* this syscall wrapper creates a map with given type and attributes * and returns map_fd on success. * use close(map_fd) to delete the map */ int bpf_create_map(enum bpf_map_type map_type, int key_size, int value_size, int max_entries) { union bpf_attr attr = { .map_type = map_type, .key_size = key_size, .value_size = value_size, .max_entries = max_entries }; return bpf(BPF_MAP_CREATE, &attr, sizeof(attr)); } 'union bpf_attr' is backwards compatible with future extensions. More details in Documentation/networking/filter.txt and in manpage Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- Documentation/networking/filter.txt | 39 +++++++++ include/linux/bpf.h | 41 +++++++++ include/uapi/linux/bpf.h | 23 +++++ kernel/bpf/Makefile | 2 +- kernel/bpf/syscall.c | 169 ++++++++++++++++++++++++++++++++++++ 5 files changed, 273 insertions(+), 1 deletion(-) create mode 100644 include/linux/bpf.h create mode 100644 kernel/bpf/syscall.c (limited to 'include/linux') diff --git a/Documentation/networking/filter.txt b/Documentation/networking/filter.txt index 014e0319a5c4..4a01d71785e9 100644 --- a/Documentation/networking/filter.txt +++ b/Documentation/networking/filter.txt @@ -1001,6 +1001,45 @@ instruction that loads 64-bit immediate value into a dst_reg. Classic BPF has similar instruction: BPF_LD | BPF_W | BPF_IMM which loads 32-bit immediate value into a register. +eBPF maps +--------- +'maps' is a generic storage of different types for sharing data between kernel +and userspace. + +The maps are accessed from user space via BPF syscall, which has commands: +- create a map with given type and attributes + map_fd = bpf(BPF_MAP_CREATE, union bpf_attr *attr, u32 size) + using attr->map_type, attr->key_size, attr->value_size, attr->max_entries + returns process-local file descriptor or negative error + +- lookup key in a given map + err = bpf(BPF_MAP_LOOKUP_ELEM, union bpf_attr *attr, u32 size) + using attr->map_fd, attr->key, attr->value + returns zero and stores found elem into value or negative error + +- create or update key/value pair in a given map + err = bpf(BPF_MAP_UPDATE_ELEM, union bpf_attr *attr, u32 size) + using attr->map_fd, attr->key, attr->value + returns zero or negative error + +- find and delete element by key in a given map + err = bpf(BPF_MAP_DELETE_ELEM, union bpf_attr *attr, u32 size) + using attr->map_fd, attr->key + +- to delete map: close(fd) + Exiting process will delete maps automatically + +userspace programs use this syscall to create/access maps that eBPF programs +are concurrently updating. + +maps can have different types: hash, array, bloom filter, radix-tree, etc. + +The map is defined by: + . type + . max number of elements + . key size in bytes + . 
value size in bytes + Testing ------- diff --git a/include/linux/bpf.h b/include/linux/bpf.h new file mode 100644 index 000000000000..48014a71f0fe --- /dev/null +++ b/include/linux/bpf.h @@ -0,0 +1,41 @@ +/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#ifndef _LINUX_BPF_H +#define _LINUX_BPF_H 1 + +#include +#include + +struct bpf_map; + +/* map is generic key/value storage optionally accessible by eBPF programs */ +struct bpf_map_ops { + /* funcs callable from userspace (via syscall) */ + struct bpf_map *(*map_alloc)(union bpf_attr *attr); + void (*map_free)(struct bpf_map *); +}; + +struct bpf_map { + atomic_t refcnt; + enum bpf_map_type map_type; + u32 key_size; + u32 value_size; + u32 max_entries; + struct bpf_map_ops *ops; + struct work_struct work; +}; + +struct bpf_map_type_list { + struct list_head list_node; + struct bpf_map_ops *ops; + enum bpf_map_type type; +}; + +void bpf_register_map_type(struct bpf_map_type_list *tl); +void bpf_map_put(struct bpf_map *map); + +#endif /* _LINUX_BPF_H */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 479ed0b6be16..f58a10f9670c 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -62,4 +62,27 @@ struct bpf_insn { __s32 imm; /* signed immediate constant */ }; +/* BPF syscall commands */ +enum bpf_cmd { + /* create a map with given type and attributes + * fd = bpf(BPF_MAP_CREATE, union bpf_attr *, u32 size) + * returns fd or negative error + * map is deleted when fd is closed + */ + BPF_MAP_CREATE, +}; + +enum bpf_map_type { + BPF_MAP_TYPE_UNSPEC, +}; + +union bpf_attr { + struct { /* anonymous struct used by BPF_MAP_CREATE command */ + __u32 map_type; /* one of enum bpf_map_type */ + __u32 key_size; /* size of key in bytes */ + __u32 value_size; /* size of value in bytes */ + __u32 max_entries; /* max number of entries in a map */ + }; +} __attribute__((aligned(8))); + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 6a71145e2769..e9f7334ed07a 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1 +1 @@ -obj-y := core.o +obj-y := core.o syscall.o diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c new file mode 100644 index 000000000000..428a0e23adc0 --- /dev/null +++ b/kernel/bpf/syscall.c @@ -0,0 +1,169 @@ +/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details.
+ */ +#include +#include +#include +#include + +static LIST_HEAD(bpf_map_types); + +static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) +{ + struct bpf_map_type_list *tl; + struct bpf_map *map; + + list_for_each_entry(tl, &bpf_map_types, list_node) { + if (tl->type == attr->map_type) { + map = tl->ops->map_alloc(attr); + if (IS_ERR(map)) + return map; + map->ops = tl->ops; + map->map_type = attr->map_type; + return map; + } + } + return ERR_PTR(-EINVAL); +} + +/* boot time registration of different map implementations */ +void bpf_register_map_type(struct bpf_map_type_list *tl) +{ + list_add(&tl->list_node, &bpf_map_types); +} + +/* called from workqueue */ +static void bpf_map_free_deferred(struct work_struct *work) +{ + struct bpf_map *map = container_of(work, struct bpf_map, work); + + /* implementation dependent freeing */ + map->ops->map_free(map); +} + +/* decrement map refcnt and schedule it for freeing via workqueue + * (underlying map implementation ops->map_free() might sleep) + */ +void bpf_map_put(struct bpf_map *map) +{ + if (atomic_dec_and_test(&map->refcnt)) { + INIT_WORK(&map->work, bpf_map_free_deferred); + schedule_work(&map->work); + } +} + +static int bpf_map_release(struct inode *inode, struct file *filp) +{ + struct bpf_map *map = filp->private_data; + + bpf_map_put(map); + return 0; +} + +static const struct file_operations bpf_map_fops = { + .release = bpf_map_release, }; + +/* helper macro to check that unused fields of 'union bpf_attr' are zero */ +#define CHECK_ATTR(CMD) \ + memchr_inv((void *) &attr->CMD##_LAST_FIELD + \ + sizeof(attr->CMD##_LAST_FIELD), 0, \ + sizeof(*attr) - \ + offsetof(union bpf_attr, CMD##_LAST_FIELD) - \ + sizeof(attr->CMD##_LAST_FIELD)) != NULL + +#define BPF_MAP_CREATE_LAST_FIELD max_entries +/* called via syscall */ +static int map_create(union bpf_attr *attr) +{ + struct bpf_map *map; + int err; + + err = CHECK_ATTR(BPF_MAP_CREATE); + if (err) + return -EINVAL; + + /* find map type and init map: hashtable vs rbtree vs bloom vs ... */ + map = find_and_alloc_map(attr); + if (IS_ERR(map)) + return PTR_ERR(map); + + atomic_set(&map->refcnt, 1); + + err = anon_inode_getfd("bpf-map", &bpf_map_fops, map, O_RDWR | O_CLOEXEC); + + if (err < 0) + /* failed to allocate fd */ + goto free_map; + + return err; + +free_map: + map->ops->map_free(map); + return err; +} + +SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) +{ + union bpf_attr attr = {}; + int err; + + /* the syscall is limited to root temporarily. This restriction will be + * lifted when security audit is clean. Note that eBPF+tracing must have + * this restriction, since it may pass kernel data to user space + */ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!access_ok(VERIFY_READ, uattr, 1)) + return -EFAULT; + + if (size > PAGE_SIZE) /* silly large */ + return -E2BIG; + + /* If we're handed a bigger struct than we know of, + * ensure all the unknown bits are 0 - i.e. new + * user-space does not rely on any kernel feature + * extensions we don't know about yet.
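 + * The same convention is used by perf_event_open() when copying perf_event_attr: a larger struct from newer user space is accepted only if every byte past the fields this kernel knows about is zero.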
+ */ + if (size > sizeof(attr)) { + unsigned char __user *addr; + unsigned char __user *end; + unsigned char val; + + addr = (void __user *)uattr + sizeof(attr); + end = (void __user *)uattr + size; + + for (; addr < end; addr++) { + err = get_user(val, addr); + if (err) + return err; + if (val) + return -E2BIG; + } + size = sizeof(attr); + } + + /* copy attributes from user space, may be less than sizeof(bpf_attr) */ + if (copy_from_user(&attr, uattr, size) != 0) + return -EFAULT; + + switch (cmd) { + case BPF_MAP_CREATE: + err = map_create(&attr); + break; + default: + err = -EINVAL; + break; + } + + return err; +} -- cgit v1.2.3-59-g8ed1b From 749730ce42a2121e1c88350d69478bff3994b10a Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 26 Sep 2014 00:16:58 -0700 Subject: bpf: enable bpf syscall on x64 and i386 done as separate commit to ease conflict resolution Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- arch/x86/syscalls/syscall_32.tbl | 1 + arch/x86/syscalls/syscall_64.tbl | 1 + include/linux/syscalls.h | 3 ++- include/uapi/asm-generic/unistd.h | 4 +++- kernel/sys_ni.c | 3 +++ 5 files changed, 10 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl index 028b78168d85..9fe1b5d002f0 100644 --- a/arch/x86/syscalls/syscall_32.tbl +++ b/arch/x86/syscalls/syscall_32.tbl @@ -363,3 +363,4 @@ 354 i386 seccomp sys_seccomp 355 i386 getrandom sys_getrandom 356 i386 memfd_create sys_memfd_create +357 i386 bpf sys_bpf diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl index 35dd922727b9..281150b539a2 100644 --- a/arch/x86/syscalls/syscall_64.tbl +++ b/arch/x86/syscalls/syscall_64.tbl @@ -327,6 +327,7 @@ 318 common getrandom sys_getrandom 319 common memfd_create sys_memfd_create 320 common kexec_file_load sys_kexec_file_load +321 common bpf sys_bpf # # x32-specific system call numbers start at 512 to avoid cache impact diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 0f86d85a9ce4..bda9b81357cc 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -65,6 +65,7 @@ struct old_linux_dirent; struct perf_event_attr; struct file_handle; struct sigaltstack; +union bpf_attr; #include #include @@ -875,5 +876,5 @@ asmlinkage long sys_seccomp(unsigned int op, unsigned int flags, const char __user *uargs); asmlinkage long sys_getrandom(char __user *buf, size_t count, unsigned int flags); - +asmlinkage long sys_bpf(int cmd, union bpf_attr *attr, unsigned int size); #endif diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 11d11bc5c78f..22749c134117 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -705,9 +705,11 @@ __SYSCALL(__NR_seccomp, sys_seccomp) __SYSCALL(__NR_getrandom, sys_getrandom) #define __NR_memfd_create 279 __SYSCALL(__NR_memfd_create, sys_memfd_create) +#define __NR_bpf 280 +__SYSCALL(__NR_bpf, sys_bpf) #undef __NR_syscalls -#define __NR_syscalls 280 +#define __NR_syscalls 281 /* * All syscalls below here should go away really, diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 391d4ddb6f4b..b4b5083f5f5e 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -218,3 +218,6 @@ cond_syscall(sys_kcmp); /* operate on Secure Computing state */ cond_syscall(sys_seccomp); + +/* access BPF programs and maps */ +cond_syscall(sys_bpf); -- cgit v1.2.3-59-g8ed1b From db20fd2b01087bdfbe30bce314a198eefedcc42e Mon Sep 17 00:00:00 2001 From: Alexei 
Starovoitov Date: Fri, 26 Sep 2014 00:16:59 -0700 Subject: bpf: add lookup/update/delete/iterate methods to BPF maps 'maps' is a generic storage of different types for sharing data between kernel and userspace. The maps are accessed from user space via BPF syscall, which has commands: - create a map with given type and attributes fd = bpf(BPF_MAP_CREATE, union bpf_attr *attr, u32 size) returns fd or negative error - lookup key in a given map referenced by fd err = bpf(BPF_MAP_LOOKUP_ELEM, union bpf_attr *attr, u32 size) using attr->map_fd, attr->key, attr->value returns zero and stores found elem into value or negative error - create or update key/value pair in a given map err = bpf(BPF_MAP_UPDATE_ELEM, union bpf_attr *attr, u32 size) using attr->map_fd, attr->key, attr->value returns zero or negative error - find and delete element by key in a given map err = bpf(BPF_MAP_DELETE_ELEM, union bpf_attr *attr, u32 size) using attr->map_fd, attr->key - iterate map elements (based on input key return next_key) err = bpf(BPF_MAP_GET_NEXT_KEY, union bpf_attr *attr, u32 size) using attr->map_fd, attr->key, attr->next_key - close(fd) deletes the map Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 8 ++ include/uapi/linux/bpf.h | 38 ++++++++ kernel/bpf/syscall.c | 235 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 281 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 48014a71f0fe..2887f3f9da59 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -9,6 +9,7 @@ #include #include +#include struct bpf_map; @@ -17,6 +18,12 @@ struct bpf_map_ops { /* funcs callable from userspace (via syscall) */ struct bpf_map *(*map_alloc)(union bpf_attr *attr); void (*map_free)(struct bpf_map *); + int (*map_get_next_key)(struct bpf_map *map, void *key, void *next_key); + + /* funcs callable from userspace and from eBPF programs */ + void *(*map_lookup_elem)(struct bpf_map *map, void *key); + int (*map_update_elem)(struct bpf_map *map, void *key, void *value); + int (*map_delete_elem)(struct bpf_map *map, void *key); }; struct bpf_map { @@ -37,5 +44,6 @@ struct bpf_map_type_list { void bpf_register_map_type(struct bpf_map_type_list *tl); void bpf_map_put(struct bpf_map *map); +struct bpf_map *bpf_map_get(struct fd f); #endif /* _LINUX_BPF_H */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f58a10f9670c..395cabd2ca0a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -70,6 +70,35 @@ enum bpf_cmd { * map is deleted when fd is closed */ BPF_MAP_CREATE, + + /* lookup key in a given map + * err = bpf(BPF_MAP_LOOKUP_ELEM, union bpf_attr *attr, u32 size) + * Using attr->map_fd, attr->key, attr->value + * returns zero and stores found elem into value + * or negative error + */ + BPF_MAP_LOOKUP_ELEM, + + /* create or update key/value pair in a given map + * err = bpf(BPF_MAP_UPDATE_ELEM, union bpf_attr *attr, u32 size) + * Using attr->map_fd, attr->key, attr->value + * returns zero or negative error + */ + BPF_MAP_UPDATE_ELEM, + + /* find and delete elem by key in a given map + * err = bpf(BPF_MAP_DELETE_ELEM, union bpf_attr *attr, u32 size) + * Using attr->map_fd, attr->key + * returns zero or negative error + */ + BPF_MAP_DELETE_ELEM, + + /* lookup key in a given map and return next key + * err = bpf(BPF_MAP_GET_NEXT_KEY, union bpf_attr *attr, u32 size) + * Using attr->map_fd, attr->key, attr->next_key + * returns zero and stores next key or negative error + */ + 
BPF_MAP_GET_NEXT_KEY, }; enum bpf_map_type { @@ -83,6 +112,15 @@ union bpf_attr { __u32 value_size; /* size of value in bytes */ __u32 max_entries; /* max number of entries in a map */ }; + + struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ + __u32 map_fd; + __aligned_u64 key; + union { + __aligned_u64 value; + __aligned_u64 next_key; + }; + }; } __attribute__((aligned(8))); #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 428a0e23adc0..f94349ecaf61 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -13,6 +13,7 @@ #include #include #include +#include static LIST_HEAD(bpf_map_types); @@ -111,6 +112,228 @@ free_map: return err; } +/* if error is returned, fd is released. + * On success caller should complete fd access with matching fdput() + */ +struct bpf_map *bpf_map_get(struct fd f) +{ + struct bpf_map *map; + + if (!f.file) + return ERR_PTR(-EBADF); + + if (f.file->f_op != &bpf_map_fops) { + fdput(f); + return ERR_PTR(-EINVAL); + } + + map = f.file->private_data; + + return map; +} + +/* helper to convert user pointers passed inside __aligned_u64 fields */ +static void __user *u64_to_ptr(__u64 val) +{ + return (void __user *) (unsigned long) val; +} + +/* last field in 'union bpf_attr' used by this command */ +#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value + +static int map_lookup_elem(union bpf_attr *attr) +{ + void __user *ukey = u64_to_ptr(attr->key); + void __user *uvalue = u64_to_ptr(attr->value); + int ufd = attr->map_fd; + struct fd f = fdget(ufd); + struct bpf_map *map; + void *key, *value; + int err; + + if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM)) + return -EINVAL; + + map = bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + + err = -ENOMEM; + key = kmalloc(map->key_size, GFP_USER); + if (!key) + goto err_put; + + err = -EFAULT; + if (copy_from_user(key, ukey, map->key_size) != 0) + goto free_key; + + err = -ESRCH; + rcu_read_lock(); + value = map->ops->map_lookup_elem(map, key); + if (!value) + goto err_unlock; + + err = -EFAULT; + if (copy_to_user(uvalue, value, map->value_size) != 0) + goto err_unlock; + + err = 0; + +err_unlock: + rcu_read_unlock(); +free_key: + kfree(key); +err_put: + fdput(f); + return err; +} + +#define BPF_MAP_UPDATE_ELEM_LAST_FIELD value + +static int map_update_elem(union bpf_attr *attr) +{ + void __user *ukey = u64_to_ptr(attr->key); + void __user *uvalue = u64_to_ptr(attr->value); + int ufd = attr->map_fd; + struct fd f = fdget(ufd); + struct bpf_map *map; + void *key, *value; + int err; + + if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM)) + return -EINVAL; + + map = bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + + err = -ENOMEM; + key = kmalloc(map->key_size, GFP_USER); + if (!key) + goto err_put; + + err = -EFAULT; + if (copy_from_user(key, ukey, map->key_size) != 0) + goto free_key; + + err = -ENOMEM; + value = kmalloc(map->value_size, GFP_USER); + if (!value) + goto free_key; + + err = -EFAULT; + if (copy_from_user(value, uvalue, map->value_size) != 0) + goto free_value; + + /* eBPF programs that use maps run under rcu_read_lock(), + * therefore all map accessors rely on this fact, so do the same here + */ + rcu_read_lock(); + err = map->ops->map_update_elem(map, key, value); + rcu_read_unlock(); + +free_value: + kfree(value); +free_key: + kfree(key); +err_put: + fdput(f); + return err; +} + +#define BPF_MAP_DELETE_ELEM_LAST_FIELD key + +static int map_delete_elem(union bpf_attr *attr) +{ + void __user *ukey = u64_to_ptr(attr->key); + int ufd = attr->map_fd; + 
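/* fdget() resolves the user-supplied fd to a file reference; bpf_map_get() below verifies that it actually refers to a BPF map */ + 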
struct fd f = fdget(ufd); + struct bpf_map *map; + void *key; + int err; + + if (CHECK_ATTR(BPF_MAP_DELETE_ELEM)) + return -EINVAL; + + map = bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + + err = -ENOMEM; + key = kmalloc(map->key_size, GFP_USER); + if (!key) + goto err_put; + + err = -EFAULT; + if (copy_from_user(key, ukey, map->key_size) != 0) + goto free_key; + + rcu_read_lock(); + err = map->ops->map_delete_elem(map, key); + rcu_read_unlock(); + +free_key: + kfree(key); +err_put: + fdput(f); + return err; +} + +/* last field in 'union bpf_attr' used by this command */ +#define BPF_MAP_GET_NEXT_KEY_LAST_FIELD next_key + +static int map_get_next_key(union bpf_attr *attr) +{ + void __user *ukey = u64_to_ptr(attr->key); + void __user *unext_key = u64_to_ptr(attr->next_key); + int ufd = attr->map_fd; + struct fd f = fdget(ufd); + struct bpf_map *map; + void *key, *next_key; + int err; + + if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY)) + return -EINVAL; + + map = bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + + err = -ENOMEM; + key = kmalloc(map->key_size, GFP_USER); + if (!key) + goto err_put; + + err = -EFAULT; + if (copy_from_user(key, ukey, map->key_size) != 0) + goto free_key; + + err = -ENOMEM; + next_key = kmalloc(map->key_size, GFP_USER); + if (!next_key) + goto free_key; + + rcu_read_lock(); + err = map->ops->map_get_next_key(map, key, next_key); + rcu_read_unlock(); + if (err) + goto free_next_key; + + err = -EFAULT; + if (copy_to_user(unext_key, next_key, map->key_size) != 0) + goto free_next_key; + + err = 0; + +free_next_key: + kfree(next_key); +free_key: + kfree(key); +err_put: + fdput(f); + return err; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr = {}; @@ -160,6 +383,18 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_MAP_CREATE: err = map_create(&attr); break; + case BPF_MAP_LOOKUP_ELEM: + err = map_lookup_elem(&attr); + break; + case BPF_MAP_UPDATE_ELEM: + err = map_update_elem(&attr); + break; + case BPF_MAP_DELETE_ELEM: + err = map_delete_elem(&attr); + break; + case BPF_MAP_GET_NEXT_KEY: + err = map_get_next_key(&attr); + break; default: err = -EINVAL; break; -- cgit v1.2.3-59-g8ed1b From 09756af46893c18839062976c3252e93a1beeba7 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 26 Sep 2014 00:17:00 -0700 Subject: bpf: expand BPF syscall with program load/unload eBPF programs are similar to kernel modules. They are loaded by the user process and automatically unloaded when the process exits. Each eBPF program is a safe run-to-completion set of instructions. The eBPF verifier statically determines that the program terminates and is safe to execute. The following syscall wrapper can be used to load the program: int bpf_prog_load(enum bpf_prog_type prog_type, const struct bpf_insn *insns, int insn_cnt, const char *license) { union bpf_attr attr = { .prog_type = prog_type, .insns = ptr_to_u64(insns), .insn_cnt = insn_cnt, .license = ptr_to_u64(license), }; return bpf(BPF_PROG_LOAD, &attr, sizeof(attr)); } where 'insns' is an array of eBPF instructions and 'license' is a string that must be GPL compatible to call helper functions marked gpl_only. Upon successful load the syscall returns prog_fd. Use close(prog_fd) to unload the program. User space tests and examples follow in later patches. Signed-off-by: Alexei Starovoitov Signed-off-by: David S.
Miller --- include/linux/bpf.h | 38 +++++++++++ include/linux/filter.h | 8 +-- include/uapi/linux/bpf.h | 26 ++++++++ kernel/bpf/core.c | 29 +++++---- kernel/bpf/syscall.c | 165 +++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 246 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2887f3f9da59..92979182be81 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -46,4 +46,42 @@ void bpf_register_map_type(struct bpf_map_type_list *tl); void bpf_map_put(struct bpf_map *map); struct bpf_map *bpf_map_get(struct fd f); +/* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs + * to in-kernel helper functions and for adjusting imm32 field in BPF_CALL + * instructions after verifying + */ +struct bpf_func_proto { + u64 (*func)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); + bool gpl_only; +}; + +struct bpf_verifier_ops { + /* return eBPF function prototype for verification */ + const struct bpf_func_proto *(*get_func_proto)(enum bpf_func_id func_id); +}; + +struct bpf_prog_type_list { + struct list_head list_node; + struct bpf_verifier_ops *ops; + enum bpf_prog_type type; +}; + +void bpf_register_prog_type(struct bpf_prog_type_list *tl); + +struct bpf_prog; + +struct bpf_prog_aux { + atomic_t refcnt; + bool is_gpl_compatible; + enum bpf_prog_type prog_type; + struct bpf_verifier_ops *ops; + struct bpf_map **used_maps; + u32 used_map_cnt; + struct bpf_prog *prog; + struct work_struct work; +}; + +void bpf_prog_put(struct bpf_prog *prog); +struct bpf_prog *bpf_prog_get(u32 ufd); + #endif /* _LINUX_BPF_H */ diff --git a/include/linux/filter.h b/include/linux/filter.h index 1a0bc6d134d7..4ffc0958d85e 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -21,6 +21,7 @@ struct sk_buff; struct sock; struct seccomp_data; +struct bpf_prog_aux; /* ArgX, context and stack frame pointer register positions. Note, * Arg1, Arg2, Arg3, etc are used as argument mappings of function @@ -300,17 +301,12 @@ struct bpf_binary_header { u8 image[]; }; -struct bpf_work_struct { - struct bpf_prog *prog; - struct work_struct work; -}; - struct bpf_prog { u16 pages; /* Number of allocated pages */ bool jited; /* Is our filter JIT'ed? 
*/ u32 len; /* Number of filter blocks */ struct sock_fprog_kern *orig_prog; /* Original BPF program */ - struct bpf_work_struct *work; /* Deferred free work struct */ + struct bpf_prog_aux *aux; /* Auxiliary fields */ unsigned int (*bpf_func)(const struct sk_buff *skb, const struct bpf_insn *filter); /* Instructions for interpreter */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 395cabd2ca0a..424f442016e7 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -99,12 +99,23 @@ enum bpf_cmd { * returns zero and stores next key or negative error */ BPF_MAP_GET_NEXT_KEY, + + /* verify and load eBPF program + * prog_fd = bpf(BPF_PROG_LOAD, union bpf_attr *attr, u32 size) + * Using attr->prog_type, attr->insns, attr->license + * returns fd or negative error + */ + BPF_PROG_LOAD, }; enum bpf_map_type { BPF_MAP_TYPE_UNSPEC, }; +enum bpf_prog_type { + BPF_PROG_TYPE_UNSPEC, +}; + union bpf_attr { struct { /* anonymous struct used by BPF_MAP_CREATE command */ __u32 map_type; /* one of enum bpf_map_type */ @@ -121,6 +132,21 @@ union bpf_attr { __aligned_u64 next_key; }; }; + + struct { /* anonymous struct used by BPF_PROG_LOAD command */ + __u32 prog_type; /* one of enum bpf_prog_type */ + __u32 insn_cnt; + __aligned_u64 insns; + __aligned_u64 license; + }; } __attribute__((aligned(8))); +/* integer value in 'imm' field of BPF_CALL instruction selects which helper + * function eBPF program intends to call + */ +enum bpf_func_id { + BPF_FUNC_unspec, + __BPF_FUNC_MAX_ID, +}; + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 8b7002488251..f0c30c59b317 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -27,6 +27,7 @@ #include #include #include +#include /* Registers */ #define BPF_R0 regs[BPF_REG_0] @@ -71,7 +72,7 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) { gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO | gfp_extra_flags; - struct bpf_work_struct *ws; + struct bpf_prog_aux *aux; struct bpf_prog *fp; size = round_up(size, PAGE_SIZE); @@ -79,14 +80,14 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) if (fp == NULL) return NULL; - ws = kmalloc(sizeof(*ws), GFP_KERNEL | gfp_extra_flags); - if (ws == NULL) { + aux = kzalloc(sizeof(*aux), GFP_KERNEL | gfp_extra_flags); + if (aux == NULL) { vfree(fp); return NULL; } fp->pages = size / PAGE_SIZE; - fp->work = ws; + fp->aux = aux; return fp; } @@ -110,10 +111,10 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE); fp->pages = size / PAGE_SIZE; - /* We keep fp->work from fp_old around in the new + /* We keep fp->aux from fp_old around in the new * reallocated structure. 
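 + * Clearing fp_old->aux below keeps __bpf_prog_free(fp_old) from freeing the aux that the reallocated program now owns.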
*/ - fp_old->aux = NULL; __bpf_prog_free(fp_old); } @@ -123,7 +124,7 @@ EXPORT_SYMBOL_GPL(bpf_prog_realloc); void __bpf_prog_free(struct bpf_prog *fp) { - kfree(fp->work); + kfree(fp->aux); vfree(fp); } EXPORT_SYMBOL_GPL(__bpf_prog_free); @@ -638,19 +639,19 @@ EXPORT_SYMBOL_GPL(bpf_prog_select_runtime); static void bpf_prog_free_deferred(struct work_struct *work) { - struct bpf_work_struct *ws; + struct bpf_prog_aux *aux; - ws = container_of(work, struct bpf_work_struct, work); - bpf_jit_free(ws->prog); + aux = container_of(work, struct bpf_prog_aux, work); + bpf_jit_free(aux->prog); } /* Free internal BPF program */ void bpf_prog_free(struct bpf_prog *fp) { - struct bpf_work_struct *ws = fp->work; + struct bpf_prog_aux *aux = fp->aux; - INIT_WORK(&ws->work, bpf_prog_free_deferred); - ws->prog = fp; - schedule_work(&ws->work); + INIT_WORK(&aux->work, bpf_prog_free_deferred); + aux->prog = fp; + schedule_work(&aux->work); } EXPORT_SYMBOL_GPL(bpf_prog_free); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index f94349ecaf61..0afb4eaa1887 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -14,6 +14,8 @@ #include #include #include +#include +#include static LIST_HEAD(bpf_map_types); @@ -334,6 +336,166 @@ err_put: return err; } +static LIST_HEAD(bpf_prog_types); + +static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) +{ + struct bpf_prog_type_list *tl; + + list_for_each_entry(tl, &bpf_prog_types, list_node) { + if (tl->type == type) { + prog->aux->ops = tl->ops; + prog->aux->prog_type = type; + return 0; + } + } + return -EINVAL; +} + +void bpf_register_prog_type(struct bpf_prog_type_list *tl) +{ + list_add(&tl->list_node, &bpf_prog_types); +} + +/* drop refcnt on maps used by eBPF program and free auxiliary data */ +static void free_used_maps(struct bpf_prog_aux *aux) +{ + int i; + + for (i = 0; i < aux->used_map_cnt; i++) + bpf_map_put(aux->used_maps[i]); + + kfree(aux->used_maps); +} + +void bpf_prog_put(struct bpf_prog *prog) +{ + if (atomic_dec_and_test(&prog->aux->refcnt)) { + free_used_maps(prog->aux); + bpf_prog_free(prog); + } +} + +static int bpf_prog_release(struct inode *inode, struct file *filp) +{ + struct bpf_prog *prog = filp->private_data; + + bpf_prog_put(prog); + return 0; +} + +static const struct file_operations bpf_prog_fops = { + .release = bpf_prog_release, }; + +static struct bpf_prog *get_prog(struct fd f) +{ + struct bpf_prog *prog; + + if (!f.file) + return ERR_PTR(-EBADF); + + if (f.file->f_op != &bpf_prog_fops) { + fdput(f); + return ERR_PTR(-EINVAL); + } + + prog = f.file->private_data; + + return prog; +} + +/* called by sockets/tracing/seccomp before attaching program to an event + * pairs with bpf_prog_put() + */ +struct bpf_prog *bpf_prog_get(u32 ufd) +{ + struct fd f = fdget(ufd); + struct bpf_prog *prog; + + prog = get_prog(f); + + if (IS_ERR(prog)) + return prog; + + atomic_inc(&prog->aux->refcnt); + fdput(f); + return prog; +} + +/* last field in 'union bpf_attr' used by this command */ +#define BPF_PROG_LOAD_LAST_FIELD license + +static int bpf_prog_load(union bpf_attr *attr) +{ + enum bpf_prog_type type = attr->prog_type; + struct bpf_prog *prog; + int err; + char license[128]; + bool is_gpl; + + if (CHECK_ATTR(BPF_PROG_LOAD)) + return -EINVAL; + + /* copy eBPF program license from user space */ + if (strncpy_from_user(license, u64_to_ptr(attr->license), + sizeof(license) - 1) < 0) + return -EFAULT; + license[sizeof(license) - 1] = 0; + + /* eBPF programs must be GPL compatible to use
GPL-ed functions */ + is_gpl = license_is_gpl_compatible(license); + + if (attr->insn_cnt >= BPF_MAXINSNS) + return -EINVAL; + + /* plain bpf_prog allocation */ + prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER); + if (!prog) + return -ENOMEM; + + prog->len = attr->insn_cnt; + + err = -EFAULT; + if (copy_from_user(prog->insns, u64_to_ptr(attr->insns), + prog->len * sizeof(struct bpf_insn)) != 0) + goto free_prog; + + prog->orig_prog = NULL; + prog->jited = false; + + atomic_set(&prog->aux->refcnt, 1); + prog->aux->is_gpl_compatible = is_gpl; + + /* find program type: socket_filter vs tracing_filter */ + err = find_prog_type(type, prog); + if (err < 0) + goto free_prog; + + /* run eBPF verifier */ + /* err = bpf_check(prog, tb); */ + + if (err < 0) + goto free_used_maps; + + /* eBPF program is ready to be JITed */ + bpf_prog_select_runtime(prog); + + err = anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC); + + if (err < 0) + /* failed to allocate fd */ + goto free_used_maps; + + return err; + +free_used_maps: + free_used_maps(prog->aux); +free_prog: + bpf_prog_free(prog); + return err; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr = {}; @@ -395,6 +557,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_MAP_GET_NEXT_KEY: err = map_get_next_key(&attr); break; + case BPF_PROG_LOAD: + err = bpf_prog_load(&attr); + break; default: err = -EINVAL; break; -- cgit v1.2.3-59-g8ed1b From 51580e798cb61b0fc63fa3aa6c5c975375aa0550 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 26 Sep 2014 00:17:02 -0700 Subject: bpf: verifier (add docs) This patch adds all of the eBPF verifier documentation and an empty bpf_check(). The end goal for the verifier is to statically check safety of the program. The verifier will catch: - loops - out of range jumps - unreachable instructions - invalid instructions - uninitialized register access - uninitialized stack access - misaligned stack access - out of range stack access - invalid calling convention More details in Documentation/networking/filter.txt Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- Documentation/networking/filter.txt | 224 ++++++++++++++++++++++++++++++++++++ include/linux/bpf.h | 2 + kernel/bpf/Makefile | 2 +- kernel/bpf/syscall.c | 2 +- kernel/bpf/verifier.c | 133 +++++++++++++++ 5 files changed, 361 insertions(+), 2 deletions(-) create mode 100644 kernel/bpf/verifier.c (limited to 'include/linux') diff --git a/Documentation/networking/filter.txt b/Documentation/networking/filter.txt index 4a01d71785e9..5ce4d07406a5 100644 --- a/Documentation/networking/filter.txt +++ b/Documentation/networking/filter.txt @@ -1001,6 +1001,99 @@ instruction that loads 64-bit immediate value into a dst_reg. Classic BPF has similar instruction: BPF_LD | BPF_W | BPF_IMM which loads 32-bit immediate value into a register. +eBPF verifier +------------- +The safety of the eBPF program is determined in two steps. + +The first step does a DAG check to disallow loops and perform other CFG validation. +In particular it will detect programs that have unreachable instructions. +(though the classic BPF checker allows them) + +The second step starts from the first insn and descends all possible paths. +It simulates execution of every insn and observes the state change of +registers and stack. + +At the start of the program the register R1 contains a pointer to context +and has type PTR_TO_CTX.
+If the verifier sees an insn that does R2=R1, then R2 now has type +PTR_TO_CTX as well and can be used on the right hand side of an expression. +If R1=PTR_TO_CTX and insn is R2=R1+R1, then R2=UNKNOWN_VALUE, +since the addition of two valid pointers makes an invalid pointer. +(In 'secure' mode the verifier will reject any type of pointer arithmetic to make +sure that kernel addresses don't leak to unprivileged users) + +If a register was never written to, it's not readable: + bpf_mov R0 = R2 + bpf_exit +will be rejected, since R2 is unreadable at the start of the program. + +After a kernel function call, R1-R5 are reset to unreadable and +R0 has the return type of the function. + +Since R6-R9 are callee saved, their state is preserved across the call. + bpf_mov R6 = 1 + bpf_call foo + bpf_mov R0 = R6 + bpf_exit +is a correct program. If there was R1 instead of R6, it would have +been rejected. + +load/store instructions are allowed only with registers of valid types, which +are PTR_TO_CTX, PTR_TO_MAP, FRAME_PTR. They are bounds- and alignment-checked. +For example: + bpf_mov R1 = 1 + bpf_mov R2 = 2 + bpf_xadd *(u32 *)(R1 + 3) += R2 + bpf_exit +will be rejected, since R1 doesn't have a valid pointer type at the time of +execution of instruction bpf_xadd. + +At the start R1 type is PTR_TO_CTX (a pointer to generic 'struct bpf_context'). +A callback is used to customize the verifier to restrict eBPF program access to only +certain fields within the ctx structure with specified size and alignment. + +For example, the following insn: + bpf_ld R0 = *(u32 *)(R6 + 8) +intends to load a word from address R6 + 8 and store it into R0. +If R6=PTR_TO_CTX, via the is_valid_access() callback the verifier will know +that offset 8 of size 4 bytes can be accessed for reading, otherwise +the verifier will reject the program. +If R6=FRAME_PTR, then the access should be aligned and be within +stack bounds, which are [-MAX_BPF_STACK, 0). In this example the offset is 8, +so it will fail verification, since it's out of bounds. + +The verifier will allow an eBPF program to read data from the stack only after +it has written into it. +The classic BPF verifier does a similar check with the M[0-15] memory slots. +For example: + bpf_ld R0 = *(u32 *)(R10 - 4) + bpf_exit +is an invalid program. +Though R10 is a correct read-only register and has type FRAME_PTR +and R10 - 4 is within stack bounds, there were no stores into that location. + +Pointer register spill/fill is tracked as well, since four (R6-R9) +callee saved registers may not be enough for some programs. + +Allowed function calls are customized with bpf_verifier_ops->get_func_proto(). +The eBPF verifier will check that registers match argument constraints. +After the call, register R0 will be set to the return type of the function. + +Function calls are the main mechanism to extend the functionality of eBPF programs. +Socket filters may let programs call one set of functions, whereas tracing +filters may allow a completely different set. + +If a function is made accessible to eBPF programs, it needs to be thought through +from a safety point of view. The verifier will guarantee that the function is +called with valid arguments. + +seccomp vs socket filters have different security restrictions for classic BPF. +Seccomp solves this with a two-stage verifier: the classic BPF verifier is followed +by the seccomp verifier. In the case of eBPF, one configurable verifier is shared for +all use cases.
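To make the user-space side of these patches concrete, here is a hypothetical minimal loader, not part of the series itself: it assumes x86-64 (where __NR_bpf is 321, per the syscall-table patch above) and hand-encodes 'r0 = 0; exit', which satisfies every check described above. Note that at this point in the series no program types are registered yet, so the load returns -EINVAL, but the calling convention is exactly the one documented here:

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/bpf.h>		/* union bpf_attr, struct bpf_insn */

	#define NR_bpf 321		/* __NR_bpf on x86-64 */

	static long sys_bpf(int cmd, union bpf_attr *attr, unsigned int size)
	{
		return syscall(NR_bpf, cmd, attr, size);
	}

	int main(void)
	{
		/* r0 = 0; exit -- R0 is initialized before exit and no
		 * memory is touched, so the verifier checks above pass */
		struct bpf_insn insns[] = {
			{ .code = 0xb7, .dst_reg = 0, .imm = 0 },	/* BPF_ALU64|BPF_MOV|BPF_K */
			{ .code = 0x95 },				/* BPF_JMP|BPF_EXIT */
		};
		union bpf_attr attr;
		long fd;

		memset(&attr, 0, sizeof(attr));
		attr.prog_type = BPF_PROG_TYPE_UNSPEC;
		attr.insn_cnt = 2;
		attr.insns = (uintptr_t)insns;
		attr.license = (uintptr_t)"GPL";

		fd = sys_bpf(BPF_PROG_LOAD, &attr, sizeof(attr));
		printf("BPF_PROG_LOAD returned %ld\n", fd);
		if (fd >= 0)
			close((int)fd);
		return 0;
	}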
+ +See details of eBPF verifier in kernel/bpf/verifier.c + eBPF maps --------- 'maps' is a generic storage of different types for sharing data between kernel @@ -1040,6 +1133,137 @@ The map is defined by: . key size in bytes . value size in bytes +Understanding eBPF verifier messages +------------------------------------ + +The following are a few examples of invalid eBPF programs and the verifier error +messages as seen in the log: + +Program with unreachable instructions: +static struct bpf_insn prog[] = { + BPF_EXIT_INSN(), + BPF_EXIT_INSN(), +}; +Error: + unreachable insn 1 + +Program that reads an uninitialized register: + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), + BPF_EXIT_INSN(), +Error: + 0: (bf) r0 = r2 + R2 !read_ok + +Program that doesn't initialize R0 before exiting: + BPF_MOV64_REG(BPF_REG_2, BPF_REG_1), + BPF_EXIT_INSN(), +Error: + 0: (bf) r2 = r1 + 1: (95) exit + R0 !read_ok + +Program that accesses stack out of bounds: + BPF_ST_MEM(BPF_DW, BPF_REG_10, 8, 0), + BPF_EXIT_INSN(), +Error: + 0: (7a) *(u64 *)(r10 +8) = 0 + invalid stack off=8 size=8 + +Program that doesn't initialize stack before passing its address into a function: + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_EXIT_INSN(), +Error: + 0: (bf) r2 = r10 + 1: (07) r2 += -8 + 2: (b7) r1 = 0x0 + 3: (85) call 1 + invalid indirect read from stack off -8+0 size 8 + +Program that uses the invalid map_fd=0 while calling the map_lookup_elem() function: + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_EXIT_INSN(), +Error: + 0: (7a) *(u64 *)(r10 -8) = 0 + 1: (bf) r2 = r10 + 2: (07) r2 += -8 + 3: (b7) r1 = 0x0 + 4: (85) call 1 + fd 0 is not pointing to valid bpf_map + +Program that doesn't check the return value of map_lookup_elem() before accessing +the map element: + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0), + BPF_EXIT_INSN(), +Error: + 0: (7a) *(u64 *)(r10 -8) = 0 + 1: (bf) r2 = r10 + 2: (07) r2 += -8 + 3: (b7) r1 = 0x0 + 4: (85) call 1 + 5: (7a) *(u64 *)(r0 +0) = 0 + R0 invalid mem access 'map_value_or_null' + +Program that correctly checks the map_lookup_elem() returned value for NULL, but +accesses the memory with incorrect alignment: + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), + BPF_ST_MEM(BPF_DW, BPF_REG_0, 4, 0), + BPF_EXIT_INSN(), +Error: + 0: (7a) *(u64 *)(r10 -8) = 0 + 1: (bf) r2 = r10 + 2: (07) r2 += -8 + 3: (b7) r1 = 1 + 4: (85) call 1 + 5: (15) if r0 == 0x0 goto pc+1 + R0=map_ptr R10=fp + 6: (7a) *(u64 *)(r0 +4) = 0 + misaligned access off 4 size 8 + +Program that correctly checks the map_lookup_elem() returned value for NULL and +accesses memory with correct alignment in one side of the 'if' branch, but fails +to do so in the other side of the 'if' branch: + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), 
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), + BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0), + BPF_EXIT_INSN(), + BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 1), + BPF_EXIT_INSN(), +Error: + 0: (7a) *(u64 *)(r10 -8) = 0 + 1: (bf) r2 = r10 + 2: (07) r2 += -8 + 3: (b7) r1 = 1 + 4: (85) call 1 + 5: (15) if r0 == 0x0 goto pc+2 + R0=map_ptr R10=fp + 6: (7a) *(u64 *)(r0 +0) = 0 + 7: (95) exit + + from 5 to 8: R0=imm0 R10=fp + 8: (7a) *(u64 *)(r0 +0) = 1 + R0 invalid mem access 'imm' + Testing ------- diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 92979182be81..9dfeb36f8971 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -83,5 +83,7 @@ struct bpf_prog_aux { void bpf_prog_put(struct bpf_prog *prog); struct bpf_prog *bpf_prog_get(u32 ufd); +/* verify correctness of eBPF program */ +int bpf_check(struct bpf_prog *fp, union bpf_attr *attr); #endif /* _LINUX_BPF_H */ diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index e9f7334ed07a..3c726b0995b7 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1 +1 @@ -obj-y := core.o syscall.o +obj-y := core.o syscall.o verifier.o diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index b513659d120f..74b3628c5fdb 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -507,7 +507,7 @@ static int bpf_prog_load(union bpf_attr *attr) goto free_prog; /* run eBPF verifier */ - /* err = bpf_check(prog, tb); */ + err = bpf_check(prog, attr); if (err < 0) goto free_used_maps; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c new file mode 100644 index 000000000000..d6f9c3d6b4d7 --- /dev/null +++ b/kernel/bpf/verifier.c @@ -0,0 +1,133 @@ +/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/bpf.h> +#include <linux/filter.h> +#include <net/netlink.h> +#include <linux/file.h> +#include <linux/vmalloc.h> + +/* bpf_check() is a static code analyzer that walks eBPF program + * instruction by instruction and updates register/stack state. + * All paths of conditional branches are analyzed until 'bpf_exit' insn. + * + * The first pass is depth-first-search to check that the program is a DAG. + * It rejects the following programs: + * - larger than BPF_MAXINSNS insns + * - if loop is present (detected via back-edge) + * - unreachable insns exist (shouldn't be a forest. program = one function) + * - out of bounds or malformed jumps + * The second pass is all possible path descent from the 1st insn. + * Since it's analyzing all paths through the program, the length of the + * analysis is limited to 32k insn, which may be hit even if the total number of + * insn is less than 4K, but there are too many branches that change stack/regs. + * Number of 'branches to be analyzed' is limited to 1k + * + * On entry to each instruction, each register has a type, and the instruction + * changes the types of the registers depending on instruction semantics. + * If instruction is BPF_MOV64_REG(BPF_REG_1, BPF_REG_5), then type of R5 is + * copied to R1. + * + * All registers are 64-bit.
+ * R0 - return register + * R1-R5 argument passing registers + * R6-R9 callee saved registers + * R10 - frame pointer read-only + * + * At the start of BPF program the register R1 contains a pointer to bpf_context + * and has type PTR_TO_CTX. + * + * Verifier tracks arithmetic operations on pointers in case: + * BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + * BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -20), + * 1st insn copies R10 (which has FRAME_PTR) type into R1 + * and 2nd arithmetic instruction is pattern matched to recognize + * that it wants to construct a pointer to some element within stack. + * So after 2nd insn, the register R1 has type PTR_TO_STACK + * (and -20 constant is saved for further stack bounds checking). + * Meaning that this reg is a pointer to stack plus known immediate constant. + * + * Most of the time the registers have UNKNOWN_VALUE type, which + * means the register has some value, but it's not a valid pointer. + * (like pointer plus pointer becomes UNKNOWN_VALUE type) + * + * When verifier sees load or store instructions the type of base register + * can be: PTR_TO_MAP_VALUE, PTR_TO_CTX, FRAME_PTR. These are three pointer + * types recognized by check_mem_access() function. + * + * PTR_TO_MAP_VALUE means that this register is pointing to 'map element value' + * and the range of [ptr, ptr + map's value_size) is accessible. + * + * registers used to pass values to function calls are checked against + * function argument constraints. + * + * ARG_PTR_TO_MAP_KEY is one of such argument constraints. + * It means that the register type passed to this function must be + * PTR_TO_STACK and it will be used inside the function as + * 'pointer to map element key' + * + * For example the argument constraints for bpf_map_lookup_elem(): + * .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, + * .arg1_type = ARG_CONST_MAP_PTR, + * .arg2_type = ARG_PTR_TO_MAP_KEY, + * + * ret_type says that this function returns 'pointer to map elem value or null' + * function expects 1st argument to be a const pointer to 'struct bpf_map' and + * 2nd argument should be a pointer to stack, which will be used inside + * the helper function as a pointer to map element key. + * + * On the kernel side the helper function looks like: + * u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) + * { + * struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; + * void *key = (void *) (unsigned long) r2; + * void *value; + * + * here kernel can access 'key' and 'map' pointers safely, knowing that + * [key, key + map->key_size) bytes are valid and were initialized on + * the stack of eBPF program. + * } + * + * Corresponding eBPF program may look like: + * BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), // after this insn R2 type is FRAME_PTR + * BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), // after this insn R2 type is PTR_TO_STACK + * BPF_LD_MAP_FD(BPF_REG_1, map_fd), // after this insn R1 type is CONST_PTR_TO_MAP + * BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + * here verifier looks at prototype of map_lookup_elem() and sees: + * .arg1_type == ARG_CONST_MAP_PTR and R1->type == CONST_PTR_TO_MAP, which is ok, + * Now verifier knows that this map has key of R1->map_ptr->key_size bytes + * + * Then .arg2_type == ARG_PTR_TO_MAP_KEY and R2->type == PTR_TO_STACK, ok so far, + * Now verifier checks that [R2, R2 + map's key_size) are within stack limits + * and were initialized prior to this call. 
+ * If it's ok, then the verifier allows this BPF_CALL insn and looks at + * .ret_type which is RET_PTR_TO_MAP_VALUE_OR_NULL, so it sets + * R0->type = PTR_TO_MAP_VALUE_OR_NULL which means the bpf_map_lookup_elem() function + * returns either a pointer to the map value or NULL. + * + * When type PTR_TO_MAP_VALUE_OR_NULL passes through 'if (reg != 0) goto +off' + * insn, the register holding that pointer in the true branch changes state to + * PTR_TO_MAP_VALUE and the same register changes state to CONST_IMM in the false + * branch. See check_cond_jmp_op(). + * + * After the call R0 is set to return type of the function and registers R1-R5 + * are set to NOT_INIT to indicate that they are no longer readable. + */ + +int bpf_check(struct bpf_prog *prog, union bpf_attr *attr) +{ + int ret = -EINVAL; + + return ret; +} -- cgit v1.2.3-59-g8ed1b From 0246e64d9a5fcd4805198de59b9b5cf1f974eb41 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 26 Sep 2014 00:17:04 -0700 Subject: bpf: handle pseudo BPF_LD_IMM64 insn eBPF programs passed from userspace use pseudo BPF_LD_IMM64 instructions to refer to a process-local map_fd. Scan the program for such instructions and, if the FDs are valid, convert them to 'struct bpf_map' pointers which will be used by the verifier to check access to maps in bpf_map_lookup/update() calls. If the program passes the verifier, convert the pseudo BPF_LD_IMM64 into a generic one by dropping the BPF_PSEUDO_MAP_FD flag. Note that the eBPF interpreter is generic and knows nothing about pseudo insns. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/filter.h | 6 ++ kernel/bpf/verifier.c | 147 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 153 insertions(+) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 4ffc0958d85e..ca95abd2bed1 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -145,6 +145,12 @@ struct bpf_prog_aux; .off = 0, \ .imm = ((__u64) (IMM)) >> 32 }) +#define BPF_PSEUDO_MAP_FD 1 + +/* pseudo BPF_LD_IMM64 insn used to refer to process-local map_fd */ +#define BPF_LD_MAP_FD(DST, MAP_FD) \ + BPF_LD_IMM64_RAW(DST, BPF_PSEUDO_MAP_FD, MAP_FD) + /* Short form of mov based on type, BPF_X: dst_reg = src_reg, BPF_K: dst_reg = imm32 */ #define BPF_MOV64_RAW(TYPE, DST, SRC, IMM) \ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 871edc1f2e1f..7227543e474b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -125,10 +125,15 @@ * are set to NOT_INIT to indicate that they are no longer readable.
*/ +#define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ + /* single container for all structs * one verifier_env per bpf_check() call */ struct verifier_env { + struct bpf_prog *prog; /* eBPF program being verified */ + struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */ + u32 used_map_cnt; /* number of used maps */ }; /* verbose verifier prints what it's seeing @@ -300,6 +305,115 @@ static void print_bpf_insn(struct bpf_insn *insn) } } +/* return the map pointer stored inside BPF_LD_IMM64 instruction */ +static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn) +{ + u64 imm64 = ((u64) (u32) insn[0].imm) | ((u64) (u32) insn[1].imm) << 32; + + return (struct bpf_map *) (unsigned long) imm64; +} + +/* look for pseudo eBPF instructions that access map FDs and + * replace them with actual map pointers + */ +static int replace_map_fd_with_map_ptr(struct verifier_env *env) +{ + struct bpf_insn *insn = env->prog->insnsi; + int insn_cnt = env->prog->len; + int i, j; + + for (i = 0; i < insn_cnt; i++, insn++) { + if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) { + struct bpf_map *map; + struct fd f; + + if (i == insn_cnt - 1 || insn[1].code != 0 || + insn[1].dst_reg != 0 || insn[1].src_reg != 0 || + insn[1].off != 0) { + verbose("invalid bpf_ld_imm64 insn\n"); + return -EINVAL; + } + + if (insn->src_reg == 0) + /* valid generic load 64-bit imm */ + goto next_insn; + + if (insn->src_reg != BPF_PSEUDO_MAP_FD) { + verbose("unrecognized bpf_ld_imm64 insn\n"); + return -EINVAL; + } + + f = fdget(insn->imm); + + map = bpf_map_get(f); + if (IS_ERR(map)) { + verbose("fd %d is not pointing to valid bpf_map\n", + insn->imm); + fdput(f); + return PTR_ERR(map); + } + + /* store map pointer inside BPF_LD_IMM64 instruction */ + insn[0].imm = (u32) (unsigned long) map; + insn[1].imm = ((u64) (unsigned long) map) >> 32; + + /* check whether we recorded this map already */ + for (j = 0; j < env->used_map_cnt; j++) + if (env->used_maps[j] == map) { + fdput(f); + goto next_insn; + } + + if (env->used_map_cnt >= MAX_USED_MAPS) { + fdput(f); + return -E2BIG; + } + + /* remember this map */ + env->used_maps[env->used_map_cnt++] = map; + + /* hold the map. If the program is rejected by verifier, + * the map will be released by release_maps() or it + * will be used by the valid program until it's unloaded + * and all maps are released in free_bpf_prog_info() + */ + atomic_inc(&map->refcnt); + + fdput(f); +next_insn: + insn++; + i++; + } + } + + /* now all pseudo BPF_LD_IMM64 instructions load valid + * 'struct bpf_map *' into a register instead of user map_fd. + * These pointers will be used later by verifier to validate map access. 
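+ * (Each 64-bit pointer is split across insn[0].imm and insn[1].imm, which is + * exactly how ld_imm64_to_map_ptr() above reassembles it.)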
+ */ + return 0; +} + +/* drop refcnt of maps used by the rejected program */ +static void release_maps(struct verifier_env *env) +{ + int i; + + for (i = 0; i < env->used_map_cnt; i++) + bpf_map_put(env->used_maps[i]); +} + +/* convert pseudo BPF_LD_IMM64 into generic BPF_LD_IMM64 */ +static void convert_pseudo_ld_imm64(struct verifier_env *env) +{ + struct bpf_insn *insn = env->prog->insnsi; + int insn_cnt = env->prog->len; + int i; + + for (i = 0; i < insn_cnt; i++, insn++) + if (insn->code == (BPF_LD | BPF_IMM | BPF_DW)) + insn->src_reg = 0; +} + int bpf_check(struct bpf_prog *prog, union bpf_attr *attr) { char __user *log_ubuf = NULL; @@ -316,6 +430,8 @@ int bpf_check(struct bpf_prog *prog, union bpf_attr *attr) if (!env) return -ENOMEM; + env->prog = prog; + /* grab the mutex to protect few globals used by verifier */ mutex_lock(&bpf_verifier_lock); @@ -342,8 +458,14 @@ int bpf_check(struct bpf_prog *prog, union bpf_attr *attr) log_level = 0; } + ret = replace_map_fd_with_map_ptr(env); + if (ret < 0) + goto skip_full_check; + /* ret = do_check(env); */ +skip_full_check: + if (log_level && log_len >= log_size - 1) { BUG_ON(log_len >= log_size); /* verifier log exceeded user supplied buffer */ @@ -357,11 +479,36 @@ int bpf_check(struct bpf_prog *prog, union bpf_attr *attr) goto free_log_buf; } + if (ret == 0 && env->used_map_cnt) { + /* if program passed verifier, update used_maps in bpf_prog_info */ + prog->aux->used_maps = kmalloc_array(env->used_map_cnt, + sizeof(env->used_maps[0]), + GFP_KERNEL); + + if (!prog->aux->used_maps) { + ret = -ENOMEM; + goto free_log_buf; + } + + memcpy(prog->aux->used_maps, env->used_maps, + sizeof(env->used_maps[0]) * env->used_map_cnt); + prog->aux->used_map_cnt = env->used_map_cnt; + + /* program is valid. Convert pseudo bpf_ld_imm64 into generic + * bpf_ld_imm64 instructions + */ + convert_pseudo_ld_imm64(env); + } free_log_buf: if (log_level) vfree(log_buf); free_env: + if (!prog->aux->used_maps) + /* if we didn't copy map pointers into bpf_prog_info, release + * them now. Otherwise free_bpf_prog_info() will release them. + */ + release_maps(env); kfree(env); mutex_unlock(&bpf_verifier_lock); return ret; -- cgit v1.2.3-59-g8ed1b From 17a5267067f3c372fec9ffb798d6eaba6b5e6a4c Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 26 Sep 2014 00:17:06 -0700 Subject: bpf: verifier (add verifier core) This patch adds verifier core which simulates execution of every insn and records the state of registers and program stack. Every branch instruction seen during simulation is pushed into state stack. When verifier reaches BPF_EXIT, it pops the state from the stack and continues until it reaches BPF_EXIT again. For program: 1: bpf_mov r1, xxx 2: if (r1 == 0) goto 5 3: bpf_mov r0, 1 4: goto 6 5: bpf_mov r0, 2 6: bpf_exit The verifier will walk insns: 1, 2, 3, 4, 6 then it will pop the state recorded at insn#2 and will continue: 5, 6 This way it walks all possible paths through the program and checks all possible values of registers. 
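In outline, the walk is a work-list loop over saved branch states. This is an editorial sketch, not the patch code; is_cond_jump() and apply_state_change() are illustrative stand-ins for per-opcode logic that lives in do_check() below:

  insn_idx = 0;
  for (;;) {
          struct bpf_insn *insn = &insns[insn_idx];

          if (is_cond_jump(insn))
                  /* save the other path's register/stack state for later */
                  push_stack(env, insn_idx + insn->off + 1, insn_idx);

          if (insn->code == (BPF_JMP | BPF_EXIT)) {
                  /* resume a saved branch, or stop when none are left */
                  insn_idx = pop_stack(env, &prev_insn_idx);
                  if (insn_idx < 0)
                          break;
                  continue;
          }

          /* simulate the insn's effect on register and stack state */
          apply_state_change(env, insn);
          insn_idx++;
  }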
While doing so, it checks for: - invalid instructions - uninitialized register access - uninitialized stack access - misaligned stack access - out of range stack access - invalid calling convention - instruction encoding that uses reserved fields The kernel subsystem configures the verifier with two callbacks: - bool (*is_valid_access)(int off, int size, enum bpf_access_type type); that provides information to the verifier about which fields of 'ctx' are accessible (remember 'ctx' is the first argument to an eBPF program) - const struct bpf_func_proto *(*get_func_proto)(enum bpf_func_id func_id); returns the argument constraints of kernel helper functions that an eBPF program may call, so that the verifier can check that the R1-R5 types match the prototype More details in Documentation/networking/filter.txt and in kernel/bpf/verifier.c Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 47 +++ kernel/bpf/verifier.c | 1075 ++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 1121 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9dfeb36f8971..3cf91754a957 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -46,6 +46,31 @@ void bpf_register_map_type(struct bpf_map_type_list *tl); void bpf_map_put(struct bpf_map *map); struct bpf_map *bpf_map_get(struct fd f); +/* function argument constraints */ +enum bpf_arg_type { + ARG_ANYTHING = 0, /* any argument is ok */ + + /* the following constraints used to prototype + * bpf_map_lookup/update/delete_elem() functions + */ + ARG_CONST_MAP_PTR, /* const argument used as pointer to bpf_map */ + ARG_PTR_TO_MAP_KEY, /* pointer to stack used as map key */ + ARG_PTR_TO_MAP_VALUE, /* pointer to stack used as map value */ + + /* the following constraints used to prototype bpf_memcmp() and other + * functions that access data on eBPF program stack + */ + ARG_PTR_TO_STACK, /* any pointer to eBPF program stack */ + ARG_CONST_STACK_SIZE, /* number of bytes accessed from stack */ +}; + +/* type of values returned from helper functions */ +enum bpf_return_type { + RET_INTEGER, /* function returns integer */ + RET_VOID, /* function doesn't return anything */ + RET_PTR_TO_MAP_VALUE_OR_NULL, /* returns a pointer to map elem value or NULL */ +}; + /* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs * to in-kernel helper functions and for adjusting imm32 field in BPF_CALL * instructions after verifying @@ -53,11 +78,33 @@ struct bpf_map *bpf_map_get(struct fd f); struct bpf_func_proto { u64 (*func)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); bool gpl_only; + enum bpf_return_type ret_type; + enum bpf_arg_type arg1_type; + enum bpf_arg_type arg2_type; + enum bpf_arg_type arg3_type; + enum bpf_arg_type arg4_type; + enum bpf_arg_type arg5_type; +}; + +/* bpf_context is intentionally undefined structure. Pointer to bpf_context is + * the first argument to eBPF programs.
+ * For socket filters: 'struct bpf_context *' == 'struct sk_buff *' + */ +struct bpf_context; + +enum bpf_access_type { + BPF_READ = 1, + BPF_WRITE = 2 }; struct bpf_verifier_ops { /* return eBPF function prototype for verification */ const struct bpf_func_proto *(*get_func_proto)(enum bpf_func_id func_id); + + /* return true if 'size' wide access at offset 'off' within bpf_context + * with 'type' (read or write) is allowed + */ + bool (*is_valid_access)(int off, int size, enum bpf_access_type type); }; struct bpf_prog_type_list { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c689ab8e2713..a086dd3210a8 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -125,6 +125,70 @@ * are set to NOT_INIT to indicate that they are no longer readable. */ +/* types of values stored in eBPF registers */ +enum bpf_reg_type { + NOT_INIT = 0, /* nothing was written into register */ + UNKNOWN_VALUE, /* reg doesn't contain a valid pointer */ + PTR_TO_CTX, /* reg points to bpf_context */ + CONST_PTR_TO_MAP, /* reg points to struct bpf_map */ + PTR_TO_MAP_VALUE, /* reg points to map element value */ + PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */ + FRAME_PTR, /* reg == frame_pointer */ + PTR_TO_STACK, /* reg == frame_pointer + imm */ + CONST_IMM, /* constant integer value */ +}; + +struct reg_state { + enum bpf_reg_type type; + union { + /* valid when type == CONST_IMM | PTR_TO_STACK */ + int imm; + + /* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE | + * PTR_TO_MAP_VALUE_OR_NULL + */ + struct bpf_map *map_ptr; + }; }; + +enum bpf_stack_slot_type { + STACK_INVALID, /* nothing was stored in this stack slot */ + STACK_SPILL, /* 1st byte of register spilled into stack */ + STACK_SPILL_PART, /* other 7 bytes of register spill */ + STACK_MISC /* BPF program wrote some data into this slot */ +}; + +struct bpf_stack_slot { + enum bpf_stack_slot_type stype; + struct reg_state reg_st; +}; + +/* state of the program: + * type of all registers and stack info + */ +struct verifier_state { + struct reg_state regs[MAX_BPF_REG]; + struct bpf_stack_slot stack[MAX_BPF_STACK]; +}; + +/* linked list of verifier states used to prune search */ +struct verifier_state_list { + struct verifier_state state; + struct verifier_state_list *next; }; + +/* verifier_state + insn_idx are pushed to stack when branch is encountered */ +struct verifier_stack_elem { + /* verifier state is 'st' + * before processing instruction 'insn_idx' + * and after processing instruction 'prev_insn_idx' + */ + struct verifier_state st; + int insn_idx; + int prev_insn_idx; + struct verifier_stack_elem *next; +}; #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ /* single container for all structs @@ -132,6 +196,9 @@ */ struct verifier_env { struct bpf_prog *prog; /* eBPF program being verified */ + struct verifier_stack_elem *head; /* stack of verifier states to be processed */ + int stack_size; /* number of states to be processed */ + struct verifier_state cur_state; /* current verifier state */ struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */ u32 used_map_cnt; /* number of used maps */ }; @@ -160,6 +227,45 @@ static void verbose(const char *fmt, ...) 
va_end(args); } +/* string representation of 'enum bpf_reg_type' */ +static const char * const reg_type_str[] = { + [NOT_INIT] = "?", + [UNKNOWN_VALUE] = "inv", + [PTR_TO_CTX] = "ctx", + [CONST_PTR_TO_MAP] = "map_ptr", + [PTR_TO_MAP_VALUE] = "map_value", + [PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null", + [FRAME_PTR] = "fp", + [PTR_TO_STACK] = "fp", + [CONST_IMM] = "imm", +}; + +static void print_verifier_state(struct verifier_env *env) +{ + enum bpf_reg_type t; + int i; + + for (i = 0; i < MAX_BPF_REG; i++) { + t = env->cur_state.regs[i].type; + if (t == NOT_INIT) + continue; + verbose(" R%d=%s", i, reg_type_str[t]); + if (t == CONST_IMM || t == PTR_TO_STACK) + verbose("%d", env->cur_state.regs[i].imm); + else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE || + t == PTR_TO_MAP_VALUE_OR_NULL) + verbose("(ks=%d,vs=%d)", + env->cur_state.regs[i].map_ptr->key_size, + env->cur_state.regs[i].map_ptr->value_size); + } + for (i = 0; i < MAX_BPF_STACK; i++) { + if (env->cur_state.stack[i].stype == STACK_SPILL) + verbose(" fp%d=%s", -MAX_BPF_STACK + i, + reg_type_str[env->cur_state.stack[i].reg_st.type]); + } + verbose("\n"); +} + static const char *const bpf_class_string[] = { [BPF_LD] = "ld", [BPF_LDX] = "ldx", @@ -305,6 +411,735 @@ static void print_bpf_insn(struct bpf_insn *insn) } } +static int pop_stack(struct verifier_env *env, int *prev_insn_idx) +{ + struct verifier_stack_elem *elem; + int insn_idx; + + if (env->head == NULL) + return -1; + + memcpy(&env->cur_state, &env->head->st, sizeof(env->cur_state)); + insn_idx = env->head->insn_idx; + if (prev_insn_idx) + *prev_insn_idx = env->head->prev_insn_idx; + elem = env->head->next; + kfree(env->head); + env->head = elem; + env->stack_size--; + return insn_idx; +} + +static struct verifier_state *push_stack(struct verifier_env *env, int insn_idx, + int prev_insn_idx) +{ + struct verifier_stack_elem *elem; + + elem = kmalloc(sizeof(struct verifier_stack_elem), GFP_KERNEL); + if (!elem) + goto err; + + memcpy(&elem->st, &env->cur_state, sizeof(env->cur_state)); + elem->insn_idx = insn_idx; + elem->prev_insn_idx = prev_insn_idx; + elem->next = env->head; + env->head = elem; + env->stack_size++; + if (env->stack_size > 1024) { + verbose("BPF program is too complex\n"); + goto err; + } + return &elem->st; +err: + /* pop all elements and return */ + while (pop_stack(env, NULL) >= 0); + return NULL; +} + +#define CALLER_SAVED_REGS 6 +static const int caller_saved[CALLER_SAVED_REGS] = { + BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5 +}; + +static void init_reg_state(struct reg_state *regs) +{ + int i; + + for (i = 0; i < MAX_BPF_REG; i++) { + regs[i].type = NOT_INIT; + regs[i].imm = 0; + regs[i].map_ptr = NULL; + } + + /* frame pointer */ + regs[BPF_REG_FP].type = FRAME_PTR; + + /* 1st arg to a function */ + regs[BPF_REG_1].type = PTR_TO_CTX; +} + +static void mark_reg_unknown_value(struct reg_state *regs, u32 regno) +{ + BUG_ON(regno >= MAX_BPF_REG); + regs[regno].type = UNKNOWN_VALUE; + regs[regno].imm = 0; + regs[regno].map_ptr = NULL; +} + +enum reg_arg_type { + SRC_OP, /* register is used as source operand */ + DST_OP, /* register is used as destination operand */ + DST_OP_NO_MARK /* same as above, check only, don't mark */ +}; + +static int check_reg_arg(struct reg_state *regs, u32 regno, + enum reg_arg_type t) +{ + if (regno >= MAX_BPF_REG) { + verbose("R%d is invalid\n", regno); + return -EINVAL; + } + + if (t == SRC_OP) { + /* check whether register used as source operand can be read */ + if (regs[regno].type == 
NOT_INIT) { + verbose("R%d !read_ok\n", regno); + return -EACCES; + } + } else { + /* check whether register used as dest operand can be written to */ + if (regno == BPF_REG_FP) { + verbose("frame pointer is read only\n"); + return -EACCES; + } + if (t == DST_OP) + mark_reg_unknown_value(regs, regno); + } + return 0; +} + +static int bpf_size_to_bytes(int bpf_size) +{ + if (bpf_size == BPF_W) + return 4; + else if (bpf_size == BPF_H) + return 2; + else if (bpf_size == BPF_B) + return 1; + else if (bpf_size == BPF_DW) + return 8; + else + return -EINVAL; +} + +/* check_stack_read/write functions track spill/fill of registers, + * stack boundary and alignment are checked in check_mem_access() + */ +static int check_stack_write(struct verifier_state *state, int off, int size, + int value_regno) +{ + struct bpf_stack_slot *slot; + int i; + + if (value_regno >= 0 && + (state->regs[value_regno].type == PTR_TO_MAP_VALUE || + state->regs[value_regno].type == PTR_TO_STACK || + state->regs[value_regno].type == PTR_TO_CTX)) { + + /* register containing pointer is being spilled into stack */ + if (size != 8) { + verbose("invalid size of register spill\n"); + return -EACCES; + } + + slot = &state->stack[MAX_BPF_STACK + off]; + slot->stype = STACK_SPILL; + /* save register state */ + slot->reg_st = state->regs[value_regno]; + for (i = 1; i < 8; i++) { + slot = &state->stack[MAX_BPF_STACK + off + i]; + slot->stype = STACK_SPILL_PART; + slot->reg_st.type = UNKNOWN_VALUE; + slot->reg_st.map_ptr = NULL; + } + } else { + + /* regular write of data into stack */ + for (i = 0; i < size; i++) { + slot = &state->stack[MAX_BPF_STACK + off + i]; + slot->stype = STACK_MISC; + slot->reg_st.type = UNKNOWN_VALUE; + slot->reg_st.map_ptr = NULL; + } + } + return 0; +} + +static int check_stack_read(struct verifier_state *state, int off, int size, + int value_regno) +{ + int i; + struct bpf_stack_slot *slot; + + slot = &state->stack[MAX_BPF_STACK + off]; + + if (slot->stype == STACK_SPILL) { + if (size != 8) { + verbose("invalid size of register spill\n"); + return -EACCES; + } + for (i = 1; i < 8; i++) { + if (state->stack[MAX_BPF_STACK + off + i].stype != + STACK_SPILL_PART) { + verbose("corrupted spill memory\n"); + return -EACCES; + } + } + + if (value_regno >= 0) + /* restore register state from stack */ + state->regs[value_regno] = slot->reg_st; + return 0; + } else { + for (i = 0; i < size; i++) { + if (state->stack[MAX_BPF_STACK + off + i].stype != + STACK_MISC) { + verbose("invalid read from stack off %d+%d size %d\n", + off, i, size); + return -EACCES; + } + } + if (value_regno >= 0) + /* have read misc data from the stack */ + mark_reg_unknown_value(state->regs, value_regno); + return 0; + } +} + +/* check read/write into map element returned by bpf_map_lookup_elem() */ +static int check_map_access(struct verifier_env *env, u32 regno, int off, + int size) +{ + struct bpf_map *map = env->cur_state.regs[regno].map_ptr; + + if (off < 0 || off + size > map->value_size) { + verbose("invalid access to map value, value_size=%d off=%d size=%d\n", + map->value_size, off, size); + return -EACCES; + } + return 0; +} + +/* check access to 'struct bpf_context' fields */ +static int check_ctx_access(struct verifier_env *env, int off, int size, + enum bpf_access_type t) +{ + if (env->prog->aux->ops->is_valid_access && + env->prog->aux->ops->is_valid_access(off, size, t)) + return 0; + + verbose("invalid bpf_context access off=%d size=%d\n", off, size); + return -EACCES; +} + +/* check whether memory at (regno + off) is 
accessible for t = (read | write) + * if t==write, value_regno is a register which value is stored into memory + * if t==read, value_regno is a register which will receive the value from memory + * if t==write && value_regno==-1, some unknown value is stored into memory + * if t==read && value_regno==-1, don't care what we read from memory + */ +static int check_mem_access(struct verifier_env *env, u32 regno, int off, + int bpf_size, enum bpf_access_type t, + int value_regno) +{ + struct verifier_state *state = &env->cur_state; + int size, err = 0; + + size = bpf_size_to_bytes(bpf_size); + if (size < 0) + return size; + + if (off % size != 0) { + verbose("misaligned access off %d size %d\n", off, size); + return -EACCES; + } + + if (state->regs[regno].type == PTR_TO_MAP_VALUE) { + err = check_map_access(env, regno, off, size); + if (!err && t == BPF_READ && value_regno >= 0) + mark_reg_unknown_value(state->regs, value_regno); + + } else if (state->regs[regno].type == PTR_TO_CTX) { + err = check_ctx_access(env, off, size, t); + if (!err && t == BPF_READ && value_regno >= 0) + mark_reg_unknown_value(state->regs, value_regno); + + } else if (state->regs[regno].type == FRAME_PTR) { + if (off >= 0 || off < -MAX_BPF_STACK) { + verbose("invalid stack off=%d size=%d\n", off, size); + return -EACCES; + } + if (t == BPF_WRITE) + err = check_stack_write(state, off, size, value_regno); + else + err = check_stack_read(state, off, size, value_regno); + } else { + verbose("R%d invalid mem access '%s'\n", + regno, reg_type_str[state->regs[regno].type]); + return -EACCES; + } + return err; +} + +static int check_xadd(struct verifier_env *env, struct bpf_insn *insn) +{ + struct reg_state *regs = env->cur_state.regs; + int err; + + if ((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) || + insn->imm != 0) { + verbose("BPF_XADD uses reserved fields\n"); + return -EINVAL; + } + + /* check src1 operand */ + err = check_reg_arg(regs, insn->src_reg, SRC_OP); + if (err) + return err; + + /* check src2 operand */ + err = check_reg_arg(regs, insn->dst_reg, SRC_OP); + if (err) + return err; + + /* check whether atomic_add can read the memory */ + err = check_mem_access(env, insn->dst_reg, insn->off, + BPF_SIZE(insn->code), BPF_READ, -1); + if (err) + return err; + + /* check whether atomic_add can write into the same memory */ + return check_mem_access(env, insn->dst_reg, insn->off, + BPF_SIZE(insn->code), BPF_WRITE, -1); +} + +/* when register 'regno' is passed into function that will read 'access_size' + * bytes from that pointer, make sure that it's within stack boundary + * and all elements of stack are initialized + */ +static int check_stack_boundary(struct verifier_env *env, + int regno, int access_size) +{ + struct verifier_state *state = &env->cur_state; + struct reg_state *regs = state->regs; + int off, i; + + if (regs[regno].type != PTR_TO_STACK) + return -EACCES; + + off = regs[regno].imm; + if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 || + access_size <= 0) { + verbose("invalid stack type R%d off=%d access_size=%d\n", + regno, off, access_size); + return -EACCES; + } + + for (i = 0; i < access_size; i++) { + if (state->stack[MAX_BPF_STACK + off + i].stype != STACK_MISC) { + verbose("invalid indirect read from stack off %d+%d size %d\n", + off, i, access_size); + return -EACCES; + } + } + return 0; +} + +static int check_func_arg(struct verifier_env *env, u32 regno, + enum bpf_arg_type arg_type, struct bpf_map **mapp) +{ + struct reg_state *reg = env->cur_state.regs + 
regno; + enum bpf_reg_type expected_type; + int err = 0; + + if (arg_type == ARG_ANYTHING) + return 0; + + if (reg->type == NOT_INIT) { + verbose("R%d !read_ok\n", regno); + return -EACCES; + } + + if (arg_type == ARG_PTR_TO_STACK || arg_type == ARG_PTR_TO_MAP_KEY || + arg_type == ARG_PTR_TO_MAP_VALUE) { + expected_type = PTR_TO_STACK; + } else if (arg_type == ARG_CONST_STACK_SIZE) { + expected_type = CONST_IMM; + } else if (arg_type == ARG_CONST_MAP_PTR) { + expected_type = CONST_PTR_TO_MAP; + } else { + verbose("unsupported arg_type %d\n", arg_type); + return -EFAULT; + } + + if (reg->type != expected_type) { + verbose("R%d type=%s expected=%s\n", regno, + reg_type_str[reg->type], reg_type_str[expected_type]); + return -EACCES; + } + + if (arg_type == ARG_CONST_MAP_PTR) { + /* bpf_map_xxx(map_ptr) call: remember that map_ptr */ + *mapp = reg->map_ptr; + + } else if (arg_type == ARG_PTR_TO_MAP_KEY) { + /* bpf_map_xxx(..., map_ptr, ..., key) call: + * check that [key, key + map->key_size) are within + * stack limits and initialized + */ + if (!*mapp) { + /* in function declaration map_ptr must come before + * map_key, so that it's verified and known before + * we have to check map_key here. Otherwise it means + * that kernel subsystem misconfigured verifier + */ + verbose("invalid map_ptr to access map->key\n"); + return -EACCES; + } + err = check_stack_boundary(env, regno, (*mapp)->key_size); + + } else if (arg_type == ARG_PTR_TO_MAP_VALUE) { + /* bpf_map_xxx(..., map_ptr, ..., value) call: + * check [value, value + map->value_size) validity + */ + if (!*mapp) { + /* kernel subsystem misconfigured verifier */ + verbose("invalid map_ptr to access map->value\n"); + return -EACCES; + } + err = check_stack_boundary(env, regno, (*mapp)->value_size); + + } else if (arg_type == ARG_CONST_STACK_SIZE) { + /* bpf_xxx(..., buf, len) call will access 'len' bytes + * from stack pointer 'buf'. 
Check it + * note: regno == len, regno - 1 == buf + */ + if (regno == 0) { + /* kernel subsystem misconfigured verifier */ + verbose("ARG_CONST_STACK_SIZE cannot be first argument\n"); + return -EACCES; + } + err = check_stack_boundary(env, regno - 1, reg->imm); + } + + return err; +} + +static int check_call(struct verifier_env *env, int func_id) +{ + struct verifier_state *state = &env->cur_state; + const struct bpf_func_proto *fn = NULL; + struct reg_state *regs = state->regs; + struct bpf_map *map = NULL; + struct reg_state *reg; + int i, err; + + /* find function prototype */ + if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) { + verbose("invalid func %d\n", func_id); + return -EINVAL; + } + + if (env->prog->aux->ops->get_func_proto) + fn = env->prog->aux->ops->get_func_proto(func_id); + + if (!fn) { + verbose("unknown func %d\n", func_id); + return -EINVAL; + } + + /* eBPF programs must be GPL compatible to use GPL-ed functions */ + if (!env->prog->aux->is_gpl_compatible && fn->gpl_only) { + verbose("cannot call GPL only function from proprietary program\n"); + return -EINVAL; + } + + /* check args */ + err = check_func_arg(env, BPF_REG_1, fn->arg1_type, &map); + if (err) + return err; + err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &map); + if (err) + return err; + err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &map); + if (err) + return err; + err = check_func_arg(env, BPF_REG_4, fn->arg4_type, &map); + if (err) + return err; + err = check_func_arg(env, BPF_REG_5, fn->arg5_type, &map); + if (err) + return err; + + /* reset caller saved regs */ + for (i = 0; i < CALLER_SAVED_REGS; i++) { + reg = regs + caller_saved[i]; + reg->type = NOT_INIT; + reg->imm = 0; + } + + /* update return register */ + if (fn->ret_type == RET_INTEGER) { + regs[BPF_REG_0].type = UNKNOWN_VALUE; + } else if (fn->ret_type == RET_VOID) { + regs[BPF_REG_0].type = NOT_INIT; + } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) { + regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; + /* remember map_ptr, so that check_map_access() + * can check 'value_size' boundary of memory access + * to map element returned from bpf_map_lookup_elem() + */ + if (map == NULL) { + verbose("kernel subsystem misconfigured verifier\n"); + return -EINVAL; + } + regs[BPF_REG_0].map_ptr = map; + } else { + verbose("unknown return type %d of func %d\n", + fn->ret_type, func_id); + return -EINVAL; + } + return 0; +} + +/* check validity of 32-bit and 64-bit arithmetic operations */ +static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn) +{ + u8 opcode = BPF_OP(insn->code); + int err; + + if (opcode == BPF_END || opcode == BPF_NEG) { + if (opcode == BPF_NEG) { + if (BPF_SRC(insn->code) != 0 || + insn->src_reg != BPF_REG_0 || + insn->off != 0 || insn->imm != 0) { + verbose("BPF_NEG uses reserved fields\n"); + return -EINVAL; + } + } else { + if (insn->src_reg != BPF_REG_0 || insn->off != 0 || + (insn->imm != 16 && insn->imm != 32 && insn->imm != 64)) { + verbose("BPF_END uses reserved fields\n"); + return -EINVAL; + } + } + + /* check src operand */ + err = check_reg_arg(regs, insn->dst_reg, SRC_OP); + if (err) + return err; + + /* check dest operand */ + err = check_reg_arg(regs, insn->dst_reg, DST_OP); + if (err) + return err; + + } else if (opcode == BPF_MOV) { + + if (BPF_SRC(insn->code) == BPF_X) { + if (insn->imm != 0 || insn->off != 0) { + verbose("BPF_MOV uses reserved fields\n"); + return -EINVAL; + } + + /* check src operand */ + err = check_reg_arg(regs, insn->src_reg, SRC_OP); + if (err) + return 
err; + } else { + if (insn->src_reg != BPF_REG_0 || insn->off != 0) { + verbose("BPF_MOV uses reserved fields\n"); + return -EINVAL; + } + } + + /* check dest operand */ + err = check_reg_arg(regs, insn->dst_reg, DST_OP); + if (err) + return err; + + if (BPF_SRC(insn->code) == BPF_X) { + if (BPF_CLASS(insn->code) == BPF_ALU64) { + /* case: R1 = R2 + * copy register state to dest reg + */ + regs[insn->dst_reg] = regs[insn->src_reg]; + } else { + regs[insn->dst_reg].type = UNKNOWN_VALUE; + regs[insn->dst_reg].map_ptr = NULL; + } + } else { + /* case: R = imm + * remember the value we stored into this reg + */ + regs[insn->dst_reg].type = CONST_IMM; + regs[insn->dst_reg].imm = insn->imm; + } + + } else if (opcode > BPF_END) { + verbose("invalid BPF_ALU opcode %x\n", opcode); + return -EINVAL; + + } else { /* all other ALU ops: and, sub, xor, add, ... */ + + bool stack_relative = false; + + if (BPF_SRC(insn->code) == BPF_X) { + if (insn->imm != 0 || insn->off != 0) { + verbose("BPF_ALU uses reserved fields\n"); + return -EINVAL; + } + /* check src1 operand */ + err = check_reg_arg(regs, insn->src_reg, SRC_OP); + if (err) + return err; + } else { + if (insn->src_reg != BPF_REG_0 || insn->off != 0) { + verbose("BPF_ALU uses reserved fields\n"); + return -EINVAL; + } + } + + /* check src2 operand */ + err = check_reg_arg(regs, insn->dst_reg, SRC_OP); + if (err) + return err; + + if ((opcode == BPF_MOD || opcode == BPF_DIV) && + BPF_SRC(insn->code) == BPF_K && insn->imm == 0) { + verbose("div by zero\n"); + return -EINVAL; + } + + /* pattern match 'bpf_add Rx, imm' instruction */ + if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 && + regs[insn->dst_reg].type == FRAME_PTR && + BPF_SRC(insn->code) == BPF_K) + stack_relative = true; + + /* check dest operand */ + err = check_reg_arg(regs, insn->dst_reg, DST_OP); + if (err) + return err; + + if (stack_relative) { + regs[insn->dst_reg].type = PTR_TO_STACK; + regs[insn->dst_reg].imm = insn->imm; + } + } + + return 0; +} + +static int check_cond_jmp_op(struct verifier_env *env, + struct bpf_insn *insn, int *insn_idx) +{ + struct reg_state *regs = env->cur_state.regs; + struct verifier_state *other_branch; + u8 opcode = BPF_OP(insn->code); + int err; + + if (opcode > BPF_EXIT) { + verbose("invalid BPF_JMP opcode %x\n", opcode); + return -EINVAL; + } + + if (BPF_SRC(insn->code) == BPF_X) { + if (insn->imm != 0) { + verbose("BPF_JMP uses reserved fields\n"); + return -EINVAL; + } + + /* check src1 operand */ + err = check_reg_arg(regs, insn->src_reg, SRC_OP); + if (err) + return err; + } else { + if (insn->src_reg != BPF_REG_0) { + verbose("BPF_JMP uses reserved fields\n"); + return -EINVAL; + } + } + + /* check src2 operand */ + err = check_reg_arg(regs, insn->dst_reg, SRC_OP); + if (err) + return err; + + /* detect if R == 0 where R was initialized to zero earlier */ + if (BPF_SRC(insn->code) == BPF_K && + (opcode == BPF_JEQ || opcode == BPF_JNE) && + regs[insn->dst_reg].type == CONST_IMM && + regs[insn->dst_reg].imm == insn->imm) { + if (opcode == BPF_JEQ) { + /* if (imm == imm) goto pc+off; + * only follow the goto, ignore fall-through + */ + *insn_idx += insn->off; + return 0; + } else { + /* if (imm != imm) goto pc+off; + * only follow fall-through branch, since + * that's where the program will go + */ + return 0; + } + } + + other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx); + if (!other_branch) + return -EFAULT; + + /* detect if R == 0 where R is returned value from bpf_map_lookup_elem() */ + if (BPF_SRC(insn->code) 
== BPF_K && + insn->imm == 0 && (opcode == BPF_JEQ || + opcode == BPF_JNE) && + regs[insn->dst_reg].type == PTR_TO_MAP_VALUE_OR_NULL) { + if (opcode == BPF_JEQ) { + /* next fallthrough insn can access memory via + * this register + */ + regs[insn->dst_reg].type = PTR_TO_MAP_VALUE; + /* branch target cannot access it, since reg == 0 */ + other_branch->regs[insn->dst_reg].type = CONST_IMM; + other_branch->regs[insn->dst_reg].imm = 0; + } else { + other_branch->regs[insn->dst_reg].type = PTR_TO_MAP_VALUE; + regs[insn->dst_reg].type = CONST_IMM; + regs[insn->dst_reg].imm = 0; + } + } else if (BPF_SRC(insn->code) == BPF_K && + (opcode == BPF_JEQ || opcode == BPF_JNE)) { + + if (opcode == BPF_JEQ) { + /* detect if (R == imm) goto + * and in the target state recognize that R = imm + */ + other_branch->regs[insn->dst_reg].type = CONST_IMM; + other_branch->regs[insn->dst_reg].imm = insn->imm; + } else { + /* detect if (R != imm) goto + * and in the fall-through state recognize that R = imm + */ + regs[insn->dst_reg].type = CONST_IMM; + regs[insn->dst_reg].imm = insn->imm; + } + } + if (log_level) + print_verifier_state(env); + return 0; +} + /* return the map pointer stored inside BPF_LD_IMM64 instruction */ static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn) { @@ -313,6 +1148,37 @@ static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn) return (struct bpf_map *) (unsigned long) imm64; } +/* verify BPF_LD_IMM64 instruction */ +static int check_ld_imm(struct verifier_env *env, struct bpf_insn *insn) +{ + struct reg_state *regs = env->cur_state.regs; + int err; + + if (BPF_SIZE(insn->code) != BPF_DW) { + verbose("invalid BPF_LD_IMM insn\n"); + return -EINVAL; + } + if (insn->off != 0) { + verbose("BPF_LD_IMM64 uses reserved fields\n"); + return -EINVAL; + } + + err = check_reg_arg(regs, insn->dst_reg, DST_OP); + if (err) + return err; + + if (insn->src_reg == 0) + /* generic move 64-bit immediate into a register */ + return 0; + + /* replace_map_fd_with_map_ptr() should have caught bad ld_imm64 */ + BUG_ON(insn->src_reg != BPF_PSEUDO_MAP_FD); + + regs[insn->dst_reg].type = CONST_PTR_TO_MAP; + regs[insn->dst_reg].map_ptr = ld_imm64_to_map_ptr(insn); + return 0; +} + /* non-recursive DFS pseudo code * 1 procedure DFS-iterative(G,v): * 2 label v as discovered @@ -498,6 +1364,212 @@ err_free: return ret; } +static int do_check(struct verifier_env *env) +{ + struct verifier_state *state = &env->cur_state; + struct bpf_insn *insns = env->prog->insnsi; + struct reg_state *regs = state->regs; + int insn_cnt = env->prog->len; + int insn_idx, prev_insn_idx = 0; + int insn_processed = 0; + bool do_print_state = false; + + init_reg_state(regs); + insn_idx = 0; + for (;;) { + struct bpf_insn *insn; + u8 class; + int err; + + if (insn_idx >= insn_cnt) { + verbose("invalid insn idx %d insn_cnt %d\n", + insn_idx, insn_cnt); + return -EFAULT; + } + + insn = &insns[insn_idx]; + class = BPF_CLASS(insn->code); + + if (++insn_processed > 32768) { + verbose("BPF program is too large. 
Processed %d insn\n", + insn_processed); + return -E2BIG; + } + + if (log_level && do_print_state) { + verbose("\nfrom %d to %d:", prev_insn_idx, insn_idx); + print_verifier_state(env); + do_print_state = false; + } + + if (log_level) { + verbose("%d: ", insn_idx); + print_bpf_insn(insn); + } + + if (class == BPF_ALU || class == BPF_ALU64) { + err = check_alu_op(regs, insn); + if (err) + return err; + + } else if (class == BPF_LDX) { + if (BPF_MODE(insn->code) != BPF_MEM || + insn->imm != 0) { + verbose("BPF_LDX uses reserved fields\n"); + return -EINVAL; + } + /* check src operand */ + err = check_reg_arg(regs, insn->src_reg, SRC_OP); + if (err) + return err; + + err = check_reg_arg(regs, insn->dst_reg, DST_OP_NO_MARK); + if (err) + return err; + + /* check that memory (src_reg + off) is readable, + * the state of dst_reg will be updated by this func + */ + err = check_mem_access(env, insn->src_reg, insn->off, + BPF_SIZE(insn->code), BPF_READ, + insn->dst_reg); + if (err) + return err; + + } else if (class == BPF_STX) { + if (BPF_MODE(insn->code) == BPF_XADD) { + err = check_xadd(env, insn); + if (err) + return err; + insn_idx++; + continue; + } + + if (BPF_MODE(insn->code) != BPF_MEM || + insn->imm != 0) { + verbose("BPF_STX uses reserved fields\n"); + return -EINVAL; + } + /* check src1 operand */ + err = check_reg_arg(regs, insn->src_reg, SRC_OP); + if (err) + return err; + /* check src2 operand */ + err = check_reg_arg(regs, insn->dst_reg, SRC_OP); + if (err) + return err; + + /* check that memory (dst_reg + off) is writeable */ + err = check_mem_access(env, insn->dst_reg, insn->off, + BPF_SIZE(insn->code), BPF_WRITE, + insn->src_reg); + if (err) + return err; + + } else if (class == BPF_ST) { + if (BPF_MODE(insn->code) != BPF_MEM || + insn->src_reg != BPF_REG_0) { + verbose("BPF_ST uses reserved fields\n"); + return -EINVAL; + } + /* check src operand */ + err = check_reg_arg(regs, insn->dst_reg, SRC_OP); + if (err) + return err; + + /* check that memory (dst_reg + off) is writeable */ + err = check_mem_access(env, insn->dst_reg, insn->off, + BPF_SIZE(insn->code), BPF_WRITE, + -1); + if (err) + return err; + + } else if (class == BPF_JMP) { + u8 opcode = BPF_OP(insn->code); + + if (opcode == BPF_CALL) { + if (BPF_SRC(insn->code) != BPF_K || + insn->off != 0 || + insn->src_reg != BPF_REG_0 || + insn->dst_reg != BPF_REG_0) { + verbose("BPF_CALL uses reserved fields\n"); + return -EINVAL; + } + + err = check_call(env, insn->imm); + if (err) + return err; + + } else if (opcode == BPF_JA) { + if (BPF_SRC(insn->code) != BPF_K || + insn->imm != 0 || + insn->src_reg != BPF_REG_0 || + insn->dst_reg != BPF_REG_0) { + verbose("BPF_JA uses reserved fields\n"); + return -EINVAL; + } + + insn_idx += insn->off + 1; + continue; + + } else if (opcode == BPF_EXIT) { + if (BPF_SRC(insn->code) != BPF_K || + insn->imm != 0 || + insn->src_reg != BPF_REG_0 || + insn->dst_reg != BPF_REG_0) { + verbose("BPF_EXIT uses reserved fields\n"); + return -EINVAL; + } + + /* eBPF calling convention is such that R0 is used + * to return the value from eBPF program. 
+ * Make sure that it's readable at this time + * of bpf_exit, which means that program wrote + * something into it earlier + */ + err = check_reg_arg(regs, BPF_REG_0, SRC_OP); + if (err) + return err; + + insn_idx = pop_stack(env, &prev_insn_idx); + if (insn_idx < 0) { + break; + } else { + do_print_state = true; + continue; + } + } else { + err = check_cond_jmp_op(env, insn, &insn_idx); + if (err) + return err; + } + } else if (class == BPF_LD) { + u8 mode = BPF_MODE(insn->code); + + if (mode == BPF_ABS || mode == BPF_IND) { + verbose("LD_ABS is not supported yet\n"); + return -EINVAL; + } else if (mode == BPF_IMM) { + err = check_ld_imm(env, insn); + if (err) + return err; + + insn_idx++; + } else { + verbose("invalid BPF_LD mode\n"); + return -EINVAL; + } + } else { + verbose("unknown insn class %d\n", class); + return -EINVAL; + } + + insn_idx++; + } + + return 0; +} + /* look for pseudo eBPF instructions that access map FDs and * replace them with actual map pointers */ @@ -651,9 +1723,10 @@ int bpf_check(struct bpf_prog *prog, union bpf_attr *attr) if (ret < 0) goto skip_full_check; - /* ret = do_check(env); */ + ret = do_check(env); skip_full_check: + while (pop_stack(env, NULL) >= 0); if (log_level && log_len >= log_size - 1) { BUG_ON(log_len >= log_size); -- cgit v1.2.3-59-g8ed1b From 6ea754eb761d9e7a8ac6fa462b05f9e4cf04fb6c Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 22 Sep 2014 11:10:50 -0700 Subject: net: Change netdev_ logging functions to return void No caller or macro uses the return value so make all the functions return void. Signed-off-by: Joe Perches Signed-off-by: David S. Miller --- include/linux/netdevice.h | 19 +++++++++---------- net/core/dev.c | 44 +++++++++++++++++--------------------------- 2 files changed, 26 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 9f5d293a0281..9b7fbacb6296 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3612,22 +3612,22 @@ static inline const char *netdev_reg_state(const struct net_device *dev) } __printf(3, 4) -int netdev_printk(const char *level, const struct net_device *dev, - const char *format, ...); +void netdev_printk(const char *level, const struct net_device *dev, + const char *format, ...); __printf(2, 3) -int netdev_emerg(const struct net_device *dev, const char *format, ...); +void netdev_emerg(const struct net_device *dev, const char *format, ...); __printf(2, 3) -int netdev_alert(const struct net_device *dev, const char *format, ...); +void netdev_alert(const struct net_device *dev, const char *format, ...); __printf(2, 3) -int netdev_crit(const struct net_device *dev, const char *format, ...); +void netdev_crit(const struct net_device *dev, const char *format, ...); __printf(2, 3) -int netdev_err(const struct net_device *dev, const char *format, ...); +void netdev_err(const struct net_device *dev, const char *format, ...); __printf(2, 3) -int netdev_warn(const struct net_device *dev, const char *format, ...); +void netdev_warn(const struct net_device *dev, const char *format, ...); __printf(2, 3) -int netdev_notice(const struct net_device *dev, const char *format, ...); +void netdev_notice(const struct net_device *dev, const char *format, ...); __printf(2, 3) -int netdev_info(const struct net_device *dev, const char *format, ...); +void netdev_info(const struct net_device *dev, const char *format, ...); #define MODULE_ALIAS_NETDEV(device) \ MODULE_ALIAS("netdev-" device) @@ -3645,7 +3645,6 @@ do { \ 
({ \ if (0) \ netdev_printk(KERN_DEBUG, __dev, format, ##args); \ - 0; \ }) #endif diff --git a/net/core/dev.c b/net/core/dev.c index e2ced01c01e4..e55c546717d4 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7066,53 +7066,45 @@ const char *netdev_drivername(const struct net_device *dev) return empty; } -static int __netdev_printk(const char *level, const struct net_device *dev, - struct va_format *vaf) +static void __netdev_printk(const char *level, const struct net_device *dev, + struct va_format *vaf) { - int r; - if (dev && dev->dev.parent) { - r = dev_printk_emit(level[1] - '0', - dev->dev.parent, - "%s %s %s%s: %pV", - dev_driver_string(dev->dev.parent), - dev_name(dev->dev.parent), - netdev_name(dev), netdev_reg_state(dev), - vaf); + dev_printk_emit(level[1] - '0', + dev->dev.parent, + "%s %s %s%s: %pV", + dev_driver_string(dev->dev.parent), + dev_name(dev->dev.parent), + netdev_name(dev), netdev_reg_state(dev), + vaf); } else if (dev) { - r = printk("%s%s%s: %pV", level, netdev_name(dev), - netdev_reg_state(dev), vaf); + printk("%s%s%s: %pV", + level, netdev_name(dev), netdev_reg_state(dev), vaf); } else { - r = printk("%s(NULL net_device): %pV", level, vaf); + printk("%s(NULL net_device): %pV", level, vaf); } - - return r; } -int netdev_printk(const char *level, const struct net_device *dev, - const char *format, ...) +void netdev_printk(const char *level, const struct net_device *dev, + const char *format, ...) { struct va_format vaf; va_list args; - int r; va_start(args, format); vaf.fmt = format; vaf.va = &args; - r = __netdev_printk(level, dev, &vaf); + __netdev_printk(level, dev, &vaf); va_end(args); - - return r; } EXPORT_SYMBOL(netdev_printk); #define define_netdev_printk_level(func, level) \ -int func(const struct net_device *dev, const char *fmt, ...) \ +void func(const struct net_device *dev, const char *fmt, ...) \ { \ - int r; \ struct va_format vaf; \ va_list args; \ \ @@ -7121,11 +7113,9 @@ int func(const struct net_device *dev, const char *fmt, ...) \ vaf.fmt = fmt; \ vaf.va = &args; \ \ - r = __netdev_printk(level, dev, &vaf); \ + __netdev_printk(level, dev, &vaf); \ \ va_end(args); \ - \ - return r; \ } \ EXPORT_SYMBOL(func); -- cgit v1.2.3-59-g8ed1b From f4a775d14489a801a5b8b0540e23ab82e2703091 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 22 Sep 2014 16:29:32 -0700 Subject: net: introduce __skb_header_release() While profiling TCP stack, I noticed one useless atomic operation in tcp_sendmsg(), caused by skb_header_release(). It turns out all current skb_header_release() users have a fresh skb, that no other user can see, so we can avoid one atomic operation. Introduce __skb_header_release() to clearly document this. This gave me a 1.5 % improvement on TCP_RR workload. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/skbuff.h | 15 +++++++++++++++ net/core/skbuff.c | 4 ++-- net/ipv4/tcp.c | 4 ++-- net/ipv4/tcp_output.c | 10 +++++----- 4 files changed, 24 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index f1bfa3781c75..8eaa62400fca 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1083,6 +1083,7 @@ static inline int skb_header_cloned(const struct sk_buff *skb) * Drop a reference to the header part of the buffer. This is done * by acquiring a payload reference. You must not read from the header * part of skb->data after this. + * Note : Check if you can use __skb_header_release() instead. 
*/ static inline void skb_header_release(struct sk_buff *skb) { @@ -1091,6 +1092,20 @@ static inline void skb_header_release(struct sk_buff *skb) atomic_add(1 << SKB_DATAREF_SHIFT, &skb_shinfo(skb)->dataref); } +/** + * __skb_header_release - release reference to header + * @skb: buffer to operate on + * + * Variant of skb_header_release() assuming skb is private to caller. + * We can avoid one atomic operation. + */ +static inline void __skb_header_release(struct sk_buff *skb) +{ + skb->nohdr = 1; + atomic_set(&skb_shinfo(skb)->dataref, 1 + (1 << SKB_DATAREF_SHIFT)); +} + + /** * skb_shared - is the buffer shared * @skb: buffer to check diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 06a8feb10099..512dc7dcbc32 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3179,7 +3179,7 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) skb_shinfo(nskb)->frag_list = p; skb_shinfo(nskb)->gso_size = pinfo->gso_size; pinfo->gso_size = 0; - skb_header_release(p); + __skb_header_release(p); NAPI_GRO_CB(nskb)->last = p; nskb->data_len += p->len; @@ -3211,7 +3211,7 @@ merge: else NAPI_GRO_CB(p)->last->next = skb; NAPI_GRO_CB(p)->last = skb; - skb_header_release(skb); + __skb_header_release(skb); lp = p; done: diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 070aeff1b131..553b01f52f71 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -609,7 +609,7 @@ static inline bool forced_push(const struct tcp_sock *tp) return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1)); } -static inline void skb_entail(struct sock *sk, struct sk_buff *skb) +static void skb_entail(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); @@ -618,7 +618,7 @@ static inline void skb_entail(struct sock *sk, struct sk_buff *skb) tcb->seq = tcb->end_seq = tp->write_seq; tcb->tcp_flags = TCPHDR_ACK; tcb->sacked = 0; - skb_header_release(skb); + __skb_header_release(skb); tcp_add_write_queue_tail(sk, skb); sk->sk_wmem_queued += skb->truesize; sk_mem_charge(sk, skb->truesize); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 8c61a7c0c889..f173b1c4f815 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -995,7 +995,7 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) /* Advance write_seq and place onto the write_queue. */ tp->write_seq = TCP_SKB_CB(skb)->end_seq; - skb_header_release(skb); + __skb_header_release(skb); tcp_add_write_queue_tail(sk, skb); sk->sk_wmem_queued += skb->truesize; sk_mem_charge(sk, skb->truesize); @@ -1167,7 +1167,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, } /* Link BUFF into the send queue. */ - skb_header_release(buff); + __skb_header_release(buff); tcp_insert_write_queue_after(skb, buff, sk); return 0; @@ -1671,7 +1671,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, tcp_set_skb_tso_segs(sk, buff, mss_now); /* Link BUFF into the send queue. 
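 * (buff is freshly allocated here and still private to this path,
 * which is what makes the non-atomic __skb_header_release() below
 * safe; the same reasoning covers every conversion in this patch.)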
*/ - skb_header_release(buff); + __skb_header_release(buff); tcp_insert_write_queue_after(skb, buff, sk); return 0; @@ -2772,7 +2772,7 @@ int tcp_send_synack(struct sock *sk) if (nskb == NULL) return -ENOMEM; tcp_unlink_write_queue(skb, sk); - skb_header_release(nskb); + __skb_header_release(nskb); __tcp_add_write_queue_head(sk, nskb); sk_wmem_free_skb(sk, skb); sk->sk_wmem_queued += nskb->truesize; @@ -2947,7 +2947,7 @@ static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb) struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); tcb->end_seq += skb->len; - skb_header_release(skb); + __skb_header_release(skb); __tcp_add_write_queue_tail(sk, skb); sk->sk_wmem_queued += skb->truesize; sk_mem_charge(sk, skb->truesize); -- cgit v1.2.3-59-g8ed1b From 3d9a0d2f8212879407e58d67f460d8920eb6543d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 25 Sep 2014 23:04:56 -0700 Subject: dql: dql_queued() should write first to reduce bus transactions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While doing high throughput test on a BQL enabled NIC, I found a very high cost in ndo_start_xmit() when accessing BQL data. It turned out the problem was caused by compiler trying to be smart, but involving a bad MESI transaction : 0.05 │ mov 0xc0(%rax),%edi // LOAD dql->num_queued 0.48 │ mov %edx,0xc8(%rax) // STORE dql->last_obj_cnt = count 58.23 │ add %edx,%edi 0.58 │ cmp %edi,0xc4(%rax) 0.76 │ mov %edi,0xc0(%rax) // STORE dql->num_queued += count 0.72 │ js bd8 I got an incredible 10 % gain [1] by making sure cpu do not attempt to get the cache line in Shared mode, but directly requests for ownership. New code : mov %edx,0xc8(%rax) // STORE dql->last_obj_cnt = count add %edx,0xc0(%rax) // RMW dql->num_queued += count mov 0xc4(%rax),%ecx // LOAD dql->adj_limit mov 0xc0(%rax),%edx // LOAD dql->num_queued cmp %edx,%ecx The TX completion was running from another cpu, with high interrupts rate. Note that I am using barrier() as a soft hint, as mb() here could be too heavy cost. [1] This was a netperf TCP_STREAM with TSO disabled, but GSO enabled. Signed-off-by: Eric Dumazet Acked-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/linux/dynamic_queue_limits.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dynamic_queue_limits.h b/include/linux/dynamic_queue_limits.h index 5621547d631b..a4be70398ce1 100644 --- a/include/linux/dynamic_queue_limits.h +++ b/include/linux/dynamic_queue_limits.h @@ -73,14 +73,22 @@ static inline void dql_queued(struct dql *dql, unsigned int count) { BUG_ON(count > DQL_MAX_OBJECT); - dql->num_queued += count; dql->last_obj_cnt = count; + + /* We want to force a write first, so that cpu do not attempt + * to get cache line containing last_obj_cnt, num_queued, adj_limit + * in Shared state, but directly does a Request For Ownership + * It is only a hint, we use barrier() only. + */ + barrier(); + + dql->num_queued += count; } /* Returns how many objects can be queued, < 0 indicates over limit. */ static inline int dql_avail(const struct dql *dql) { - return dql->adj_limit - dql->num_queued; + return ACCESS_ONCE(dql->adj_limit) - ACCESS_ONCE(dql->num_queued); } /* Record number of completed objects and recalculate the limit. 
*/ -- cgit v1.2.3-59-g8ed1b From b1937227316417aa7568d01e6fa1f272e98fb890 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 28 Sep 2014 22:18:47 -0700 Subject: net: reorganize sk_buff for faster __copy_skb_header() With proliferation of bit fields in sk_buff, __copy_skb_header() became quite expensive, showing as the most expensive function in a GSO workload. __copy_skb_header() performance is also critical for non GSO TCP operations, as it is used from skb_clone() This patch carefully moves all the fields that were not copied in a separate zone : cloned, nohdr, fclone, peeked, head_frag, xmit_more Then I moved all other fields and all other copied fields in a section delimited by headers_start[0]/headers_end[0] section so that we can use a single memcpy() call, inlined by compiler using long word load/stores. I also tried to make all copies in the natural orders of sk_buff, to help hardware prefetching. I made sure sk_buff size did not change. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/skbuff.h | 133 ++++++++++++++++++++++++++----------------------- net/core/skbuff.c | 80 ++++++++++++++--------------- 2 files changed, 113 insertions(+), 100 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 8eaa62400fca..b6cced304b26 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -527,27 +527,41 @@ struct sk_buff { char cb[48] __aligned(8); unsigned long _skb_refdst; + void (*destructor)(struct sk_buff *skb); #ifdef CONFIG_XFRM struct sec_path *sp; +#endif +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) + struct nf_conntrack *nfct; +#endif +#ifdef CONFIG_BRIDGE_NETFILTER + struct nf_bridge_info *nf_bridge; #endif unsigned int len, data_len; __u16 mac_len, hdr_len; - union { - __wsum csum; - struct { - __u16 csum_start; - __u16 csum_offset; - }; - }; - __u32 priority; + + /* Following fields are _not_ copied in __copy_skb_header() + * Note that queue_mapping is here mostly to fill a hole. 
+ */ kmemcheck_bitfield_begin(flags1); - __u8 ignore_df:1, - cloned:1, - ip_summed:2, + __u16 queue_mapping; + __u8 cloned:1, nohdr:1, - nfctinfo:3; + fclone:2, + peeked:1, + head_frag:1, + xmit_more:1; + /* one bit hole */ + kmemcheck_bitfield_end(flags1); + + + + /* fields enclosed in headers_start/headers_end are copied + * using a single memcpy() in __copy_skb_header() + */ + __u32 headers_start[0]; /* if you move pkt_type around you also must adapt those constants */ #ifdef __BIG_ENDIAN_BITFIELD @@ -558,58 +572,53 @@ struct sk_buff { #define PKT_TYPE_OFFSET() offsetof(struct sk_buff, __pkt_type_offset) __u8 __pkt_type_offset[0]; - __u8 pkt_type:3, - fclone:2, - ipvs_property:1, - peeked:1, - nf_trace:1; - kmemcheck_bitfield_end(flags1); - __be16 protocol; - - void (*destructor)(struct sk_buff *skb); -#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) - struct nf_conntrack *nfct; -#endif -#ifdef CONFIG_BRIDGE_NETFILTER - struct nf_bridge_info *nf_bridge; -#endif - - int skb_iif; - - __u32 hash; - - __be16 vlan_proto; - __u16 vlan_tci; - -#ifdef CONFIG_NET_SCHED - __u16 tc_index; /* traffic control index */ -#ifdef CONFIG_NET_CLS_ACT - __u16 tc_verd; /* traffic control verdict */ -#endif -#endif - - __u16 queue_mapping; - kmemcheck_bitfield_begin(flags2); - __u8 xmit_more:1; -#ifdef CONFIG_IPV6_NDISC_NODETYPE - __u8 ndisc_nodetype:2; -#endif + __u8 pkt_type:3; __u8 pfmemalloc:1; + __u8 ignore_df:1; + __u8 nfctinfo:3; + + __u8 nf_trace:1; + __u8 ip_summed:2; __u8 ooo_okay:1; __u8 l4_hash:1; __u8 sw_hash:1; __u8 wifi_acked_valid:1; __u8 wifi_acked:1; + __u8 no_fcs:1; - __u8 head_frag:1; /* Indicates the inner headers are valid in the skbuff. */ __u8 encapsulation:1; __u8 encap_hdr_csum:1; __u8 csum_valid:1; __u8 csum_complete_sw:1; - /* 1/3 bit hole (depending on ndisc_nodetype presence) */ - kmemcheck_bitfield_end(flags2); + __u8 csum_level:2; + __u8 csum_bad:1; +#ifdef CONFIG_IPV6_NDISC_NODETYPE + __u8 ndisc_nodetype:2; +#endif + __u8 ipvs_property:1; + /* 5 or 7 bit hole */ + +#ifdef CONFIG_NET_SCHED + __u16 tc_index; /* traffic control index */ +#ifdef CONFIG_NET_CLS_ACT + __u16 tc_verd; /* traffic control verdict */ +#endif +#endif + + union { + __wsum csum; + struct { + __u16 csum_start; + __u16 csum_offset; + }; + }; + __u32 priority; + int skb_iif; + __u32 hash; + __be16 vlan_proto; + __u16 vlan_tci; #if defined CONFIG_NET_DMA || defined CONFIG_NET_RX_BUSY_POLL union { unsigned int napi_id; @@ -625,19 +634,18 @@ struct sk_buff { __u32 reserved_tailroom; }; - kmemcheck_bitfield_begin(flags3); - __u8 csum_level:2; - __u8 csum_bad:1; - /* 13 bit hole */ - kmemcheck_bitfield_end(flags3); - __be16 inner_protocol; __u16 inner_transport_header; __u16 inner_network_header; __u16 inner_mac_header; + + __be16 protocol; __u16 transport_header; __u16 network_header; __u16 mac_header; + + __u32 headers_end[0]; + /* These elements must be at the end, see alloc_skb() for details. */ sk_buff_data_t tail; sk_buff_data_t end; @@ -3040,19 +3048,22 @@ static inline void nf_reset_trace(struct sk_buff *skb) } /* Note: This doesn't put any conntrack and bridge info in dst. 
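 * With the new @copy argument (below), __copy_skb_header() can pass
 * false: the nfctinfo and nf_trace bits now live inside the
 * headers_start..headers_end region and are already transferred by
 * its single memcpy(), so only the refcounted pointers need
 * individual handling there.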
*/ -static inline void __nf_copy(struct sk_buff *dst, const struct sk_buff *src) +static inline void __nf_copy(struct sk_buff *dst, const struct sk_buff *src, + bool copy) { #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) dst->nfct = src->nfct; nf_conntrack_get(src->nfct); - dst->nfctinfo = src->nfctinfo; + if (copy) + dst->nfctinfo = src->nfctinfo; #endif #ifdef CONFIG_BRIDGE_NETFILTER dst->nf_bridge = src->nf_bridge; nf_bridge_get(src->nf_bridge); #endif #if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) || defined(CONFIG_NF_TABLES) - dst->nf_trace = src->nf_trace; + if (copy) + dst->nf_trace = src->nf_trace; #endif } @@ -3064,7 +3075,7 @@ static inline void nf_copy(struct sk_buff *dst, const struct sk_buff *src) #ifdef CONFIG_BRIDGE_NETFILTER nf_bridge_put(dst->nf_bridge); #endif - __nf_copy(dst, src); + __nf_copy(dst, src, true); } #ifdef CONFIG_NETWORK_SECMARK diff --git a/net/core/skbuff.c b/net/core/skbuff.c index d4fdc649112c..4be570a4ab21 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -261,7 +261,6 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, atomic_t *fclone_ref = (atomic_t *) (child + 1); kmemcheck_annotate_bitfield(child, flags1); - kmemcheck_annotate_bitfield(child, flags2); skb->fclone = SKB_FCLONE_ORIG; atomic_set(fclone_ref, 1); @@ -675,57 +674,61 @@ void consume_skb(struct sk_buff *skb) } EXPORT_SYMBOL(consume_skb); +/* Make sure a field is enclosed inside headers_start/headers_end section */ +#define CHECK_SKB_FIELD(field) \ + BUILD_BUG_ON(offsetof(struct sk_buff, field) < \ + offsetof(struct sk_buff, headers_start)); \ + BUILD_BUG_ON(offsetof(struct sk_buff, field) > \ + offsetof(struct sk_buff, headers_end)); \ + static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) { new->tstamp = old->tstamp; + /* We do not copy old->sk */ new->dev = old->dev; - new->transport_header = old->transport_header; - new->network_header = old->network_header; - new->mac_header = old->mac_header; - new->inner_protocol = old->inner_protocol; - new->inner_transport_header = old->inner_transport_header; - new->inner_network_header = old->inner_network_header; - new->inner_mac_header = old->inner_mac_header; + memcpy(new->cb, old->cb, sizeof(old->cb)); skb_dst_copy(new, old); - skb_copy_hash(new, old); - new->ooo_okay = old->ooo_okay; - new->no_fcs = old->no_fcs; - new->encapsulation = old->encapsulation; - new->encap_hdr_csum = old->encap_hdr_csum; - new->csum_valid = old->csum_valid; - new->csum_complete_sw = old->csum_complete_sw; #ifdef CONFIG_XFRM new->sp = secpath_get(old->sp); #endif - memcpy(new->cb, old->cb, sizeof(old->cb)); - new->csum = old->csum; - new->ignore_df = old->ignore_df; - new->pkt_type = old->pkt_type; - new->ip_summed = old->ip_summed; - skb_copy_queue_mapping(new, old); - new->priority = old->priority; -#if IS_ENABLED(CONFIG_IP_VS) - new->ipvs_property = old->ipvs_property; + __nf_copy(new, old, false); + + /* Note : this field could be in headers_start/headers_end section + * It is not yet because we do not want to have a 16 bit hole + */ + new->queue_mapping = old->queue_mapping; + + memcpy(&new->headers_start, &old->headers_start, + offsetof(struct sk_buff, headers_end) - + offsetof(struct sk_buff, headers_start)); + CHECK_SKB_FIELD(protocol); + CHECK_SKB_FIELD(csum); + CHECK_SKB_FIELD(hash); + CHECK_SKB_FIELD(priority); + CHECK_SKB_FIELD(skb_iif); + CHECK_SKB_FIELD(vlan_proto); + CHECK_SKB_FIELD(vlan_tci); + CHECK_SKB_FIELD(transport_header); + CHECK_SKB_FIELD(network_header); + 
CHECK_SKB_FIELD(mac_header); + CHECK_SKB_FIELD(inner_protocol); + CHECK_SKB_FIELD(inner_transport_header); + CHECK_SKB_FIELD(inner_network_header); + CHECK_SKB_FIELD(inner_mac_header); + CHECK_SKB_FIELD(mark); +#ifdef CONFIG_NETWORK_SECMARK + CHECK_SKB_FIELD(secmark); +#endif +#ifdef CONFIG_NET_RX_BUSY_POLL + CHECK_SKB_FIELD(napi_id); #endif - new->pfmemalloc = old->pfmemalloc; - new->protocol = old->protocol; - new->mark = old->mark; - new->skb_iif = old->skb_iif; - __nf_copy(new, old); #ifdef CONFIG_NET_SCHED - new->tc_index = old->tc_index; + CHECK_SKB_FIELD(tc_index); #ifdef CONFIG_NET_CLS_ACT - new->tc_verd = old->tc_verd; + CHECK_SKB_FIELD(tc_verd); #endif #endif - new->vlan_proto = old->vlan_proto; - new->vlan_tci = old->vlan_tci; - - skb_copy_secmark(new, old); -#ifdef CONFIG_NET_RX_BUSY_POLL - new->napi_id = old->napi_id; -#endif } /* @@ -876,7 +879,6 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) return NULL; kmemcheck_annotate_bitfield(n, flags1); - kmemcheck_annotate_bitfield(n, flags2); n->fclone = SKB_FCLONE_UNAVAILABLE; } -- cgit v1.2.3-59-g8ed1b From 8c14f9c70327a6fb75534c4c61d7ea9c82ccf78f Mon Sep 17 00:00:00 2001 From: Michael Grzeschik Date: Mon, 29 Sep 2014 11:55:36 +0200 Subject: ARCNET: add com20020 PCI IDs with metadata This patch adds metadata for the com20020 to prepare for devices with multiple io address areas with multi card interfaces. Signed-off-by: Michael Grzeschik Signed-off-by: David S. Miller --- drivers/net/arcnet/com20020-pci.c | 216 ++++++++++++++++++++++++++++++++------ include/linux/com20020.h | 16 +++ 2 files changed, 197 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/arcnet/com20020-pci.c b/drivers/net/arcnet/com20020-pci.c index 7bb292e59559..f9e55527739d 100644 --- a/drivers/net/arcnet/com20020-pci.c +++ b/drivers/net/arcnet/com20020-pci.c @@ -63,6 +63,8 @@ MODULE_LICENSE("GPL"); static int com20020pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) { + struct com20020_pci_channel_map *cm; + struct com20020_pci_card_info *ci; struct net_device *dev; struct arcnet_local *lp; int ioaddr, err; @@ -75,19 +77,15 @@ static int com20020pci_probe(struct pci_dev *pdev, const struct pci_device_id *i dev->netdev_ops = &com20020_netdev_ops; + ci = (struct com20020_pci_card_info *)id->driver_data; + lp = netdev_priv(dev); pci_set_drvdata(pdev, dev); - // SOHARD needs PCI base addr 4 - if (pdev->vendor==0x10B5) { - BUGMSG(D_NORMAL, "SOHARD\n"); - ioaddr = pci_resource_start(pdev, 4); - } - else { - BUGMSG(D_NORMAL, "Contemporary Controls\n"); - ioaddr = pci_resource_start(pdev, 2); - } + cm = &ci->chan_map_tbl[0]; + BUGMSG(D_NORMAL, "%s Controls\n", ci->name); + ioaddr = pci_resource_start(pdev, cm->bar); if (!request_region(ioaddr, ARCNET_TOTAL_SIZE, "com20020-pci")) { BUGMSG(D_INIT, "IO region %xh-%xh already allocated.\n", @@ -105,7 +103,7 @@ static int com20020pci_probe(struct pci_dev *pdev, const struct pci_device_id *i dev->irq = pdev->irq; dev->dev_addr[0] = node; lp->card_name = "PCI COM20020"; - lp->card_flags = id->driver_data; + lp->card_flags = ci->flags; lp->backplane = backplane; lp->clockp = clockp & 7; lp->clockm = clockm & 3; @@ -144,32 +142,180 @@ static void com20020pci_remove(struct pci_dev *pdev) free_netdev(dev); } +static struct com20020_pci_card_info card_info_10mbit = { + .name = "ARC-PCI", + .devcount = 1, + .chan_map_tbl = { + { 2, 0x00, 0x08 }, + }, + .flags = ARC_CAN_10MBIT, +}; + +static struct com20020_pci_card_info card_info_5mbit = { + .name = "ARC-PCI", + 
.devcount = 1, + .chan_map_tbl = { + { 2, 0x00, 0x08 }, + }, + .flags = ARC_IS_5MBIT, +}; + +static struct com20020_pci_card_info card_info_sohard = { + .name = "PLX-PCI", + .devcount = 1, + /* SOHARD needs PCI base addr 4 */ + .chan_map_tbl = { + {4, 0x00, 0x08}, + }, + .flags = ARC_CAN_10MBIT, +}; + static const struct pci_device_id com20020pci_id_table[] = { - { 0x1571, 0xa001, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0 }, - { 0x1571, 0xa002, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0 }, - { 0x1571, 0xa003, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0 }, - { 0x1571, 0xa004, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0 }, - { 0x1571, 0xa005, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0 }, - { 0x1571, 0xa006, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0 }, - { 0x1571, 0xa007, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0 }, - { 0x1571, 0xa008, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0 }, - { 0x1571, 0xa009, PCI_ANY_ID, PCI_ANY_ID, 0, 0, ARC_IS_5MBIT }, - { 0x1571, 0xa00a, PCI_ANY_ID, PCI_ANY_ID, 0, 0, ARC_IS_5MBIT }, - { 0x1571, 0xa00b, PCI_ANY_ID, PCI_ANY_ID, 0, 0, ARC_IS_5MBIT }, - { 0x1571, 0xa00c, PCI_ANY_ID, PCI_ANY_ID, 0, 0, ARC_IS_5MBIT }, - { 0x1571, 0xa00d, PCI_ANY_ID, PCI_ANY_ID, 0, 0, ARC_IS_5MBIT }, - { 0x1571, 0xa00e, PCI_ANY_ID, PCI_ANY_ID, 0, 0, ARC_IS_5MBIT }, - { 0x1571, 0xa201, PCI_ANY_ID, PCI_ANY_ID, 0, 0, ARC_CAN_10MBIT }, - { 0x1571, 0xa202, PCI_ANY_ID, PCI_ANY_ID, 0, 0, ARC_CAN_10MBIT }, - { 0x1571, 0xa203, PCI_ANY_ID, PCI_ANY_ID, 0, 0, ARC_CAN_10MBIT }, - { 0x1571, 0xa204, PCI_ANY_ID, PCI_ANY_ID, 0, 0, ARC_CAN_10MBIT }, - { 0x1571, 0xa205, PCI_ANY_ID, PCI_ANY_ID, 0, 0, ARC_CAN_10MBIT }, - { 0x1571, 0xa206, PCI_ANY_ID, PCI_ANY_ID, 0, 0, ARC_CAN_10MBIT }, - { 0x10B5, 0x9030, 0x10B5, 0x2978, 0, 0, ARC_CAN_10MBIT }, - { 0x10B5, 0x9050, 0x10B5, 0x2273, 0, 0, ARC_CAN_10MBIT }, - { 0x14BA, 0x6000, PCI_ANY_ID, PCI_ANY_ID, 0, 0, ARC_CAN_10MBIT }, - { 0x10B5, 0x2200, PCI_ANY_ID, PCI_ANY_ID, 0, 0, ARC_CAN_10MBIT }, - {0,} + { + 0x1571, 0xa001, + PCI_ANY_ID, PCI_ANY_ID, + 0, 0, + 0, + }, + { + 0x1571, 0xa002, + PCI_ANY_ID, PCI_ANY_ID, + 0, 0, + 0, + }, + { + 0x1571, 0xa003, + PCI_ANY_ID, PCI_ANY_ID, + 0, 0, + 0 + }, + { + 0x1571, 0xa004, + PCI_ANY_ID, PCI_ANY_ID, + 0, 0, + 0, + }, + { + 0x1571, 0xa005, + PCI_ANY_ID, PCI_ANY_ID, + 0, 0, + 0 + }, + { + 0x1571, 0xa006, + PCI_ANY_ID, PCI_ANY_ID, + 0, 0, + 0 + }, + { + 0x1571, 0xa007, + PCI_ANY_ID, PCI_ANY_ID, + 0, 0, + 0 + }, + { + 0x1571, 0xa008, + PCI_ANY_ID, PCI_ANY_ID, + 0, 0, + 0 + }, + { + 0x1571, 0xa009, + PCI_ANY_ID, PCI_ANY_ID, + 0, 0, + (kernel_ulong_t)&card_info_5mbit + }, + { + 0x1571, 0xa00a, + PCI_ANY_ID, PCI_ANY_ID, + 0, 0, + (kernel_ulong_t)&card_info_5mbit + }, + { + 0x1571, 0xa00b, + PCI_ANY_ID, PCI_ANY_ID, + 0, 0, + (kernel_ulong_t)&card_info_5mbit + }, + { + 0x1571, 0xa00c, + PCI_ANY_ID, PCI_ANY_ID, + 0, 0, + (kernel_ulong_t)&card_info_5mbit + }, + { + 0x1571, 0xa00d, + PCI_ANY_ID, PCI_ANY_ID, + 0, 0, + (kernel_ulong_t)&card_info_5mbit + }, + { + 0x1571, 0xa00e, + PCI_ANY_ID, PCI_ANY_ID, + 0, 0, + (kernel_ulong_t)&card_info_5mbit + }, + { + 0x1571, 0xa201, + PCI_ANY_ID, PCI_ANY_ID, + 0, 0, + (kernel_ulong_t)&card_info_10mbit + }, + { + 0x1571, 0xa202, + PCI_ANY_ID, PCI_ANY_ID, + 0, 0, + (kernel_ulong_t)&card_info_10mbit + }, + { + 0x1571, 0xa203, + PCI_ANY_ID, PCI_ANY_ID, + 0, 0, + (kernel_ulong_t)&card_info_10mbit + }, + { + 0x1571, 0xa204, + PCI_ANY_ID, PCI_ANY_ID, + 0, 0, + (kernel_ulong_t)&card_info_10mbit + }, + { + 0x1571, 0xa205, + PCI_ANY_ID, PCI_ANY_ID, + 0, 0, + (kernel_ulong_t)&card_info_10mbit + }, + { + 0x1571, 0xa206, + PCI_ANY_ID, PCI_ANY_ID, + 0, 0, + (kernel_ulong_t)&card_info_10mbit + }, + 
{ + 0x10B5, 0x9030, + 0x10B5, 0x2978, + 0, 0, + (kernel_ulong_t)&card_info_sohard + }, + { + 0x10B5, 0x9050, + 0x10B5, 0x2273, + 0, 0, + (kernel_ulong_t)&card_info_sohard + }, + { + 0x14BA, 0x6000, + PCI_ANY_ID, PCI_ANY_ID, + 0, 0, + (kernel_ulong_t)&card_info_10mbit + }, + { + 0x10B5, 0x2200, + PCI_ANY_ID, PCI_ANY_ID, + 0, 0, + (kernel_ulong_t)&card_info_10mbit + }, + { 0, } }; MODULE_DEVICE_TABLE(pci, com20020pci_id_table); diff --git a/include/linux/com20020.h b/include/linux/com20020.h index 5dcfb944b6ce..6a1ceca61e7f 100644 --- a/include/linux/com20020.h +++ b/include/linux/com20020.h @@ -41,6 +41,22 @@ extern const struct net_device_ops com20020_netdev_ops; #define BUS_ALIGN 1 #endif +#define PLX_PCI_MAX_CARDS 1 + +struct com20020_pci_channel_map { + u32 bar; + u32 offset; + u32 size; /* 0x00 - auto, e.g. length of entire bar */ +}; + +struct com20020_pci_card_info { + const char *name; + int devcount; + + struct com20020_pci_channel_map chan_map_tbl[PLX_PCI_MAX_CARDS]; + + unsigned int flags; +}; #define _INTMASK (ioaddr+BUS_ALIGN*0) /* writable */ #define _STATUS (ioaddr+BUS_ALIGN*0) /* readable */ -- cgit v1.2.3-59-g8ed1b From c51da42a6346c0c747e70a4f5ae873da1150a784 Mon Sep 17 00:00:00 2001 From: Michael Grzeschik Date: Mon, 29 Sep 2014 11:55:37 +0200 Subject: ARCNET: add support for multi interfaces on com20020 The com20020-pci driver is currently designed to instance one netdev with one pci device. This patch adds support to instance many cards with one pci device, depending on the device data in the private data. Signed-off-by: Michael Grzeschik Signed-off-by: David S. Miller --- drivers/net/arcnet/com20020-pci.c | 149 ++++++++++++++++++++++++-------------- drivers/net/arcnet/com20020_cs.c | 4 - include/linux/com20020.h | 15 +++- 3 files changed, 109 insertions(+), 59 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/arcnet/com20020-pci.c b/drivers/net/arcnet/com20020-pci.c index f9e55527739d..fe87576bae3c 100644 --- a/drivers/net/arcnet/com20020-pci.c +++ b/drivers/net/arcnet/com20020-pci.c @@ -38,6 +38,7 @@ #include #include #include +#include #include @@ -61,85 +62,125 @@ module_param(clockp, int, 0); module_param(clockm, int, 0); MODULE_LICENSE("GPL"); +static void com20020pci_remove(struct pci_dev *pdev); + static int com20020pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) { - struct com20020_pci_channel_map *cm; struct com20020_pci_card_info *ci; struct net_device *dev; struct arcnet_local *lp; - int ioaddr, err; + struct com20020_priv *priv; + int i, ioaddr, ret; + struct resource *r; if (pci_enable_device(pdev)) return -EIO; - dev = alloc_arcdev(device); - if (!dev) - return -ENOMEM; - - dev->netdev_ops = &com20020_netdev_ops; + priv = devm_kzalloc(&pdev->dev, sizeof(struct com20020_priv), + GFP_KERNEL); ci = (struct com20020_pci_card_info *)id->driver_data; + priv->ci = ci; - lp = netdev_priv(dev); + INIT_LIST_HEAD(&priv->list_dev); - pci_set_drvdata(pdev, dev); - cm = &ci->chan_map_tbl[0]; - BUGMSG(D_NORMAL, "%s Controls\n", ci->name); - ioaddr = pci_resource_start(pdev, cm->bar); + for (i = 0; i < ci->devcount; i++) { + struct com20020_pci_channel_map *cm = &ci->chan_map_tbl[i]; + struct com20020_dev *card; - if (!request_region(ioaddr, ARCNET_TOTAL_SIZE, "com20020-pci")) { - BUGMSG(D_INIT, "IO region %xh-%xh already allocated.\n", - ioaddr, ioaddr + ARCNET_TOTAL_SIZE - 1); - err = -EBUSY; - goto out_dev; - } + dev = alloc_arcdev(device); + if (!dev) { + ret = -ENOMEM; + goto out_port; + } - // Dummy access after Reset - // ARCNET 
controller needs this access to detect bustype - outb(0x00,ioaddr+1); - inb(ioaddr+1); - - dev->base_addr = ioaddr; - dev->irq = pdev->irq; - dev->dev_addr[0] = node; - lp->card_name = "PCI COM20020"; - lp->card_flags = ci->flags; - lp->backplane = backplane; - lp->clockp = clockp & 7; - lp->clockm = clockm & 3; - lp->timeout = timeout; - lp->hw.owner = THIS_MODULE; - - if (ASTATUS() == 0xFF) { - BUGMSG(D_NORMAL, "IO address %Xh was reported by PCI BIOS, " - "but seems empty!\n", ioaddr); - err = -EIO; - goto out_port; - } - if (com20020_check(dev)) { - err = -EIO; - goto out_port; + dev->netdev_ops = &com20020_netdev_ops; + + lp = netdev_priv(dev); + + BUGMSG(D_NORMAL, "%s Controls\n", ci->name); + ioaddr = pci_resource_start(pdev, cm->bar) + cm->offset; + + r = devm_request_region(&pdev->dev, ioaddr, cm->size, + "com20020-pci"); + if (!r) { + pr_err("IO region %xh-%xh already allocated.\n", + ioaddr, ioaddr + cm->size - 1); + ret = -EBUSY; + goto out_port; + } + + /* Dummy access after Reset + * ARCNET controller needs + * this access to detect bustype + */ + outb(0x00, ioaddr + 1); + inb(ioaddr + 1); + + dev->base_addr = ioaddr; + dev->dev_addr[0] = node; + dev->irq = pdev->irq; + lp->card_name = "PCI COM20020"; + lp->card_flags = ci->flags; + lp->backplane = backplane; + lp->clockp = clockp & 7; + lp->clockm = clockm & 3; + lp->timeout = timeout; + lp->hw.owner = THIS_MODULE; + + if (ASTATUS() == 0xFF) { + pr_err("IO address %Xh is empty!\n", ioaddr); + ret = -EIO; + goto out_port; + } + if (com20020_check(dev)) { + ret = -EIO; + goto out_port; + } + + card = devm_kzalloc(&pdev->dev, sizeof(struct com20020_dev), + GFP_KERNEL); + if (!card) { + pr_err("%s out of memory!\n", __func__); + return -ENOMEM; + } + + card->index = i; + card->pci_priv = priv; + card->dev = dev; + + dev_set_drvdata(&dev->dev, card); + + ret = com20020_found(dev, IRQF_SHARED); + if (ret) + goto out_port; + + list_add(&card->list, &priv->list_dev); } - if ((err = com20020_found(dev, IRQF_SHARED)) != 0) - goto out_port; + pci_set_drvdata(pdev, priv); return 0; out_port: - release_region(ioaddr, ARCNET_TOTAL_SIZE); -out_dev: - free_netdev(dev); - return err; + com20020pci_remove(pdev); + return ret; } static void com20020pci_remove(struct pci_dev *pdev) { - struct net_device *dev = pci_get_drvdata(pdev); - unregister_netdev(dev); - free_irq(dev->irq, dev); - release_region(dev->base_addr, ARCNET_TOTAL_SIZE); - free_netdev(dev); + struct com20020_dev *card, *tmpcard; + struct com20020_priv *priv; + + priv = pci_get_drvdata(pdev); + + list_for_each_entry_safe(card, tmpcard, &priv->list_dev, list) { + struct net_device *dev = card->dev; + + unregister_netdev(dev); + free_irq(dev->irq, dev); + free_netdev(dev); + } } static struct com20020_pci_card_info card_info_10mbit = { diff --git a/drivers/net/arcnet/com20020_cs.c b/drivers/net/arcnet/com20020_cs.c index 1a790a20210d..057d9582132a 100644 --- a/drivers/net/arcnet/com20020_cs.c +++ b/drivers/net/arcnet/com20020_cs.c @@ -112,10 +112,6 @@ static void com20020_detach(struct pcmcia_device *p_dev); /*====================================================================*/ -struct com20020_dev { - struct net_device *dev; -}; - static int com20020_probe(struct pcmcia_device *p_dev) { struct com20020_dev *info; diff --git a/include/linux/com20020.h b/include/linux/com20020.h index 6a1ceca61e7f..85898995b234 100644 --- a/include/linux/com20020.h +++ b/include/linux/com20020.h @@ -41,7 +41,7 @@ extern const struct net_device_ops com20020_netdev_ops; #define BUS_ALIGN 1 #endif 
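/* A hedged illustration of the multi-channel case this series
 * enables (not from the patch; the name and BAR layout here are
 * hypothetical):
 *
 *        static struct com20020_pci_card_info card_info_dual = {
 *                .name = "DUAL-PCI",             // hypothetical
 *                .devcount = 2,
 *                .chan_map_tbl = {
 *                        { 2, 0x00, 0x08 },      // channel 0
 *                        { 2, 0x08, 0x08 },      // channel 1, hypothetical offset
 *                },
 *                .flags = ARC_CAN_10MBIT,
 *        };
 *
 * com20020pci_probe() then iterates devcount entries and allocates
 * one net_device per channel map.
 */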
-#define PLX_PCI_MAX_CARDS 1 +#define PLX_PCI_MAX_CARDS 2 struct com20020_pci_channel_map { u32 bar; @@ -58,6 +58,19 @@ struct com20020_pci_card_info { unsigned int flags; }; +struct com20020_priv { + struct com20020_pci_card_info *ci; + struct list_head list_dev; +}; + +struct com20020_dev { + struct list_head list; + struct net_device *dev; + + struct com20020_priv *pci_priv; + int index; +}; + #define _INTMASK (ioaddr+BUS_ALIGN*0) /* writable */ #define _STATUS (ioaddr+BUS_ALIGN*0) /* readable */ #define _COMMAND (ioaddr+BUS_ALIGN*1) /* standard arcnet commands */ -- cgit v1.2.3-59-g8ed1b From 79cf79abce71eb7dbc40e2f3121048ca5405cb47 Mon Sep 17 00:00:00 2001 From: Michael Braun Date: Thu, 25 Sep 2014 16:31:08 +0200 Subject: macvlan: add source mode This patch adds a new mode of operation to macvlan, called "source". It allows one to set a list of allowed mac address, which is used to match against source mac address from received frames on underlying interface. This enables creating mac based VLAN associations, instead of standard port or tag based. The feature is useful to deploy 802.1x mac based behavior, where drivers of underlying interfaces doesn't allows that. Configuration is done through the netlink interface using e.g.: ip link add link eth0 name macvlan0 type macvlan mode source ip link add link eth0 name macvlan1 type macvlan mode source ip link set link dev macvlan0 type macvlan macaddr add 00:11:11:11:11:11 ip link set link dev macvlan0 type macvlan macaddr add 00:22:22:22:22:22 ip link set link dev macvlan0 type macvlan macaddr add 00:33:33:33:33:33 ip link set link dev macvlan1 type macvlan macaddr add 00:33:33:33:33:33 ip link set link dev macvlan1 type macvlan macaddr add 00:44:44:44:44:44 This allows clients with MAC addresses 00:11:11:11:11:11, 00:22:22:22:22:22 to be part of only VLAN associated with macvlan0 interface. Clients with MAC addresses 00:44:44:44:44:44 with only VLAN associated with macvlan1 interface. And client with MAC address 00:33:33:33:33:33 to be associated with both VLANs. Based on work of Stefan Gula v8: last version of Stefan Gula for Kernel 3.2.1 v9: rework onto linux-next 2014-03-12 by Michael Braun add MACADDR_SET command, enable to configure mac for source mode while creating interface v10: - reduce indention level - rename source_list to source_entry - use aligned 64bit ether address - use hash_64 instead of addr[5] v11: - rebase for 3.14 / linux-next 20.04.2014 v12 - rebase for linux-next 2014-09-25 Signed-off-by: Michael Braun Signed-off-by: David S. 
Miller --- drivers/net/macvlan.c | 304 ++++++++++++++++++++++++++++++++++++++++++- include/linux/if_macvlan.h | 1 + include/uapi/linux/if_link.h | 12 ++ 3 files changed, 314 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index 726edabff26b..e8a453f1b458 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -35,7 +35,8 @@ #include #include -#define MACVLAN_HASH_SIZE (1 << BITS_PER_BYTE) +#define MACVLAN_HASH_BITS 8 +#define MACVLAN_HASH_SIZE (1<>= 16; +#else + value <<= 16; +#endif + return hash_64(value, MACVLAN_HASH_BITS); +} + static struct macvlan_port *macvlan_port_get_rcu(const struct net_device *dev) { return rcu_dereference(dev->rx_handler_data); @@ -73,20 +96,68 @@ static struct macvlan_dev *macvlan_hash_lookup(const struct macvlan_port *port, const unsigned char *addr) { struct macvlan_dev *vlan; + u32 idx = macvlan_eth_hash(addr); - hlist_for_each_entry_rcu(vlan, &port->vlan_hash[addr[5]], hlist) { + hlist_for_each_entry_rcu(vlan, &port->vlan_hash[idx], hlist) { if (ether_addr_equal_64bits(vlan->dev->dev_addr, addr)) return vlan; } return NULL; } +static struct macvlan_source_entry *macvlan_hash_lookup_source( + const struct macvlan_dev *vlan, + const unsigned char *addr) +{ + struct macvlan_source_entry *entry; + u32 idx = macvlan_eth_hash(addr); + struct hlist_head *h = &vlan->port->vlan_source_hash[idx]; + + hlist_for_each_entry_rcu(entry, h, hlist) { + if (ether_addr_equal_64bits(entry->addr, addr) && + entry->vlan == vlan) + return entry; + } + return NULL; +} + +static int macvlan_hash_add_source(struct macvlan_dev *vlan, + const unsigned char *addr) +{ + struct macvlan_port *port = vlan->port; + struct macvlan_source_entry *entry; + struct hlist_head *h; + + entry = macvlan_hash_lookup_source(vlan, addr); + if (entry) + return 0; + + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return -ENOMEM; + + ether_addr_copy(entry->addr, addr); + entry->vlan = vlan; + h = &port->vlan_source_hash[macvlan_eth_hash(addr)]; + hlist_add_head_rcu(&entry->hlist, h); + vlan->macaddr_count++; + + return 0; +} + static void macvlan_hash_add(struct macvlan_dev *vlan) { struct macvlan_port *port = vlan->port; const unsigned char *addr = vlan->dev->dev_addr; + u32 idx = macvlan_eth_hash(addr); - hlist_add_head_rcu(&vlan->hlist, &port->vlan_hash[addr[5]]); + hlist_add_head_rcu(&vlan->hlist, &port->vlan_hash[idx]); +} + +static void macvlan_hash_del_source(struct macvlan_source_entry *entry) +{ + hlist_del_rcu(&entry->hlist); + kfree_rcu(entry, rcu); } static void macvlan_hash_del(struct macvlan_dev *vlan, bool sync) @@ -267,6 +338,65 @@ err: atomic_long_inc(&skb->dev->rx_dropped); } +static void macvlan_flush_sources(struct macvlan_port *port, + struct macvlan_dev *vlan) +{ + int i; + + for (i = 0; i < MACVLAN_HASH_SIZE; i++) { + struct hlist_node *h, *n; + + hlist_for_each_safe(h, n, &port->vlan_source_hash[i]) { + struct macvlan_source_entry *entry; + + entry = hlist_entry(h, struct macvlan_source_entry, + hlist); + if (entry->vlan == vlan) + macvlan_hash_del_source(entry); + } + } + vlan->macaddr_count = 0; +} + +static void macvlan_forward_source_one(struct sk_buff *skb, + struct macvlan_dev *vlan) +{ + struct sk_buff *nskb; + struct net_device *dev; + int len; + int ret; + + dev = vlan->dev; + if (unlikely(!(dev->flags & IFF_UP))) + return; + + nskb = skb_clone(skb, GFP_ATOMIC); + if (!nskb) + return; + + len = nskb->len + ETH_HLEN; + nskb->dev = dev; + nskb->pkt_type = PACKET_HOST; + + ret = 
netif_rx(nskb); + macvlan_count_rx(vlan, len, ret == NET_RX_SUCCESS, 0); +} + +static void macvlan_forward_source(struct sk_buff *skb, + struct macvlan_port *port, + const unsigned char *addr) +{ + struct macvlan_source_entry *entry; + u32 idx = macvlan_eth_hash(addr); + struct hlist_head *h = &port->vlan_source_hash[idx]; + + hlist_for_each_entry_rcu(entry, h, hlist) { + if (ether_addr_equal_64bits(entry->addr, addr)) + if (entry->vlan->dev->flags & IFF_UP) + macvlan_forward_source_one(skb, entry->vlan); + } +} + /* called under rcu_read_lock() from netif_receive_skb */ static rx_handler_result_t macvlan_handle_frame(struct sk_buff **pskb) { @@ -285,6 +415,7 @@ static rx_handler_result_t macvlan_handle_frame(struct sk_buff **pskb) if (!skb) return RX_HANDLER_CONSUMED; eth = eth_hdr(skb); + macvlan_forward_source(skb, port, eth->h_source); src = macvlan_hash_lookup(port, eth->h_source); if (src && src->mode != MACVLAN_MODE_VEPA && src->mode != MACVLAN_MODE_BRIDGE) { @@ -301,6 +432,7 @@ static rx_handler_result_t macvlan_handle_frame(struct sk_buff **pskb) return RX_HANDLER_PASS; } + macvlan_forward_source(skb, port, eth->h_source); if (port->passthru) vlan = list_first_or_null_rcu(&port->vlans, struct macvlan_dev, list); @@ -667,6 +799,7 @@ static void macvlan_uninit(struct net_device *dev) free_percpu(vlan->pcpu_stats); + macvlan_flush_sources(port, vlan); port->count -= 1; if (!port->count) macvlan_port_destroy(port->dev); @@ -925,6 +1058,8 @@ static int macvlan_port_create(struct net_device *dev) INIT_LIST_HEAD(&port->vlans); for (i = 0; i < MACVLAN_HASH_SIZE; i++) INIT_HLIST_HEAD(&port->vlan_hash[i]); + for (i = 0; i < MACVLAN_HASH_SIZE; i++) + INIT_HLIST_HEAD(&port->vlan_source_hash[i]); skb_queue_head_init(&port->bc_queue); INIT_WORK(&port->bc_work, macvlan_process_broadcast); @@ -966,11 +1101,102 @@ static int macvlan_validate(struct nlattr *tb[], struct nlattr *data[]) case MACVLAN_MODE_VEPA: case MACVLAN_MODE_BRIDGE: case MACVLAN_MODE_PASSTHRU: + case MACVLAN_MODE_SOURCE: + break; + default: + return -EINVAL; + } + } + + if (data && data[IFLA_MACVLAN_MACADDR_MODE]) { + switch (nla_get_u32(data[IFLA_MACVLAN_MACADDR_MODE])) { + case MACVLAN_MACADDR_ADD: + case MACVLAN_MACADDR_DEL: + case MACVLAN_MACADDR_FLUSH: + case MACVLAN_MACADDR_SET: break; default: return -EINVAL; } } + + if (data && data[IFLA_MACVLAN_MACADDR]) { + if (nla_len(data[IFLA_MACVLAN_MACADDR]) != ETH_ALEN) + return -EINVAL; + + if (!is_valid_ether_addr(nla_data(data[IFLA_MACVLAN_MACADDR]))) + return -EADDRNOTAVAIL; + } + + if (data && data[IFLA_MACVLAN_MACADDR_COUNT]) + return -EINVAL; + + return 0; +} + +/** + * reconfigure list of remote source mac address + * (only for macvlan devices in source mode) + * Note regarding alignment: all netlink data is aligned to 4 Byte, which + * suffices for both ether_addr_copy and ether_addr_equal_64bits usage. 
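 *
 * A hedged sketch of the attribute layout this function expects for
 * MACVLAN_MACADDR_SET (illustrative, not normative):
 *
 *        IFLA_MACVLAN_MACADDR_MODE = MACVLAN_MACADDR_SET
 *        IFLA_MACVLAN_MACADDR      = 00:11:11:11:11:11  (optional, added first)
 *        IFLA_MACVLAN_MACADDR_DATA (nest)
 *                IFLA_MACVLAN_MACADDR = 00:22:22:22:22:22
 *                IFLA_MACVLAN_MACADDR = 00:33:33:33:33:33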
+ */ +static int macvlan_changelink_sources(struct macvlan_dev *vlan, u32 mode, + struct nlattr *data[]) +{ + char *addr = NULL; + int ret, rem, len; + struct nlattr *nla, *head; + struct macvlan_source_entry *entry; + + if (data[IFLA_MACVLAN_MACADDR]) + addr = nla_data(data[IFLA_MACVLAN_MACADDR]); + + if (mode == MACVLAN_MACADDR_ADD) { + if (!addr) + return -EINVAL; + + return macvlan_hash_add_source(vlan, addr); + + } else if (mode == MACVLAN_MACADDR_DEL) { + if (!addr) + return -EINVAL; + + entry = macvlan_hash_lookup_source(vlan, addr); + if (entry) { + macvlan_hash_del_source(entry); + vlan->macaddr_count--; + } + } else if (mode == MACVLAN_MACADDR_FLUSH) { + macvlan_flush_sources(vlan->port, vlan); + } else if (mode == MACVLAN_MACADDR_SET) { + macvlan_flush_sources(vlan->port, vlan); + + if (addr) { + ret = macvlan_hash_add_source(vlan, addr); + if (ret) + return ret; + } + + if (!data || !data[IFLA_MACVLAN_MACADDR_DATA]) + return 0; + + head = nla_data(data[IFLA_MACVLAN_MACADDR_DATA]); + len = nla_len(data[IFLA_MACVLAN_MACADDR_DATA]); + + nla_for_each_attr(nla, head, len, rem) { + if (nla_type(nla) != IFLA_MACVLAN_MACADDR || + nla_len(nla) != ETH_ALEN) + continue; + + addr = nla_data(nla); + ret = macvlan_hash_add_source(vlan, addr); + if (ret) + return ret; + } + } else { + return -EINVAL; + } + return 0; } @@ -981,6 +1207,7 @@ int macvlan_common_newlink(struct net *src_net, struct net_device *dev, struct macvlan_port *port; struct net_device *lowerdev; int err; + int macmode; if (!tb[IFLA_LINK]) return -EINVAL; @@ -1034,6 +1261,15 @@ int macvlan_common_newlink(struct net *src_net, struct net_device *dev, eth_hw_addr_inherit(dev, lowerdev); } + if (data && data[IFLA_MACVLAN_MACADDR_MODE]) { + if (vlan->mode != MACVLAN_MODE_SOURCE) + return -EINVAL; + macmode = nla_get_u32(data[IFLA_MACVLAN_MACADDR_MODE]); + err = macvlan_changelink_sources(vlan, macmode, data); + if (err) + return err; + } + port->count += 1; err = register_netdevice(dev); if (err < 0) @@ -1070,6 +1306,8 @@ void macvlan_dellink(struct net_device *dev, struct list_head *head) { struct macvlan_dev *vlan = netdev_priv(dev); + if (vlan->mode == MACVLAN_MODE_SOURCE) + macvlan_flush_sources(vlan->port, vlan); list_del_rcu(&vlan->list); unregister_netdevice_queue(dev, head); netdev_upper_dev_unlink(vlan->lowerdev, dev); @@ -1082,6 +1320,8 @@ static int macvlan_changelink(struct net_device *dev, struct macvlan_dev *vlan = netdev_priv(dev); enum macvlan_mode mode; bool set_mode = false; + enum macvlan_macaddr_mode macmode; + int ret; /* Validate mode, but don't set yet: setting flags may fail. 
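 * (the one side effect taken during validation is flushing the
 * remote-source list when leaving MACVLAN_MODE_SOURCE, since those
 * entries are meaningless in any other mode.)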
*/ if (data && data[IFLA_MACVLAN_MODE]) { @@ -1091,6 +1331,9 @@ static int macvlan_changelink(struct net_device *dev, if ((mode == MACVLAN_MODE_PASSTHRU) != (vlan->mode == MACVLAN_MODE_PASSTHRU)) return -EINVAL; + if (vlan->mode == MACVLAN_MODE_SOURCE && + vlan->mode != mode) + macvlan_flush_sources(vlan->port, vlan); } if (data && data[IFLA_MACVLAN_FLAGS]) { @@ -1110,26 +1353,77 @@ static int macvlan_changelink(struct net_device *dev, } if (set_mode) vlan->mode = mode; + if (data && data[IFLA_MACVLAN_MACADDR_MODE]) { + if (vlan->mode != MACVLAN_MODE_SOURCE) + return -EINVAL; + macmode = nla_get_u32(data[IFLA_MACVLAN_MACADDR_MODE]); + ret = macvlan_changelink_sources(vlan, macmode, data); + if (ret) + return ret; + } return 0; } +static size_t macvlan_get_size_mac(const struct macvlan_dev *vlan) +{ + if (vlan->macaddr_count == 0) + return 0; + return nla_total_size(0) /* IFLA_MACVLAN_MACADDR_DATA */ + + vlan->macaddr_count * nla_total_size(sizeof(u8) * ETH_ALEN); +} + static size_t macvlan_get_size(const struct net_device *dev) { + struct macvlan_dev *vlan = netdev_priv(dev); + return (0 + nla_total_size(4) /* IFLA_MACVLAN_MODE */ + nla_total_size(2) /* IFLA_MACVLAN_FLAGS */ + + nla_total_size(4) /* IFLA_MACVLAN_MACADDR_COUNT */ + + macvlan_get_size_mac(vlan) /* IFLA_MACVLAN_MACADDR */ ); } +static int macvlan_fill_info_macaddr(struct sk_buff *skb, + const struct macvlan_dev *vlan, + const int i) +{ + struct hlist_head *h = &vlan->port->vlan_source_hash[i]; + struct macvlan_source_entry *entry; + + hlist_for_each_entry_rcu(entry, h, hlist) { + if (entry->vlan != vlan) + continue; + if (nla_put(skb, IFLA_MACVLAN_MACADDR, ETH_ALEN, entry->addr)) + return 1; + } + return 0; +} + static int macvlan_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); + int i; + struct nlattr *nest; if (nla_put_u32(skb, IFLA_MACVLAN_MODE, vlan->mode)) goto nla_put_failure; if (nla_put_u16(skb, IFLA_MACVLAN_FLAGS, vlan->flags)) goto nla_put_failure; + if (nla_put_u32(skb, IFLA_MACVLAN_MACADDR_COUNT, vlan->macaddr_count)) + goto nla_put_failure; + if (vlan->macaddr_count > 0) { + nest = nla_nest_start(skb, IFLA_MACVLAN_MACADDR_DATA); + if (nest == NULL) + goto nla_put_failure; + + for (i = 0; i < MACVLAN_HASH_SIZE; i++) { + if (macvlan_fill_info_macaddr(skb, vlan, i)) + goto nla_put_failure; + } + nla_nest_end(skb, nest); + } return 0; nla_put_failure: @@ -1139,6 +1433,10 @@ nla_put_failure: static const struct nla_policy macvlan_policy[IFLA_MACVLAN_MAX + 1] = { [IFLA_MACVLAN_MODE] = { .type = NLA_U32 }, [IFLA_MACVLAN_FLAGS] = { .type = NLA_U16 }, + [IFLA_MACVLAN_MACADDR_MODE] = { .type = NLA_U32 }, + [IFLA_MACVLAN_MACADDR] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, + [IFLA_MACVLAN_MACADDR_DATA] = { .type = NLA_NESTED }, + [IFLA_MACVLAN_MACADDR_COUNT] = { .type = NLA_U32 }, }; int macvlan_link_register(struct rtnl_link_ops *ops) diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h index 6b2c7cf352a5..6f6929ea8a0c 100644 --- a/include/linux/if_macvlan.h +++ b/include/linux/if_macvlan.h @@ -60,6 +60,7 @@ struct macvlan_dev { #ifdef CONFIG_NET_POLL_CONTROLLER struct netpoll *netpoll; #endif + unsigned int macaddr_count; }; static inline void macvlan_count_rx(const struct macvlan_dev *vlan, diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index c80f95f6ee78..0bdb77e16875 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -303,6 +303,10 @@ enum { IFLA_MACVLAN_UNSPEC, IFLA_MACVLAN_MODE, 
IFLA_MACVLAN_FLAGS, + IFLA_MACVLAN_MACADDR_MODE, + IFLA_MACVLAN_MACADDR, + IFLA_MACVLAN_MACADDR_DATA, + IFLA_MACVLAN_MACADDR_COUNT, __IFLA_MACVLAN_MAX, }; @@ -313,6 +317,14 @@ enum macvlan_mode { MACVLAN_MODE_VEPA = 2, /* talk to other ports through ext bridge */ MACVLAN_MODE_BRIDGE = 4, /* talk to bridge ports directly */ MACVLAN_MODE_PASSTHRU = 8,/* take over the underlying device */ + MACVLAN_MODE_SOURCE = 16,/* use source MAC address list to assign */ +}; + +enum macvlan_macaddr_mode { + MACVLAN_MACADDR_ADD, + MACVLAN_MACADDR_DEL, + MACVLAN_MACADDR_FLUSH, + MACVLAN_MACADDR_SET, }; #define MACVLAN_FLAG_NOPROMISC 1 -- cgit v1.2.3-59-g8ed1b From 2101e533f41a90b25bee17ce969734e26eb0eb55 Mon Sep 17 00:00:00 2001 From: Hauke Mehrtens Date: Fri, 26 Sep 2014 00:09:19 +0200 Subject: bcma: register bcma as device tree driver This driver is used by the bcm53xx ARM SoC code. Now it is possible to give the address of the chipcommon core in device tree and bcma will search for all the other cores. Signed-off-by: Hauke Mehrtens Acked-by: Arnd Bergmann Signed-off-by: John W. Linville --- Documentation/devicetree/bindings/bus/bcma.txt | 20 +++++++ drivers/bcma/bcma_private.h | 14 +++++ drivers/bcma/host_soc.c | 81 ++++++++++++++++++++++++++ drivers/bcma/main.c | 52 ++++++++++++++++- include/linux/bcma/bcma.h | 2 + 5 files changed, 168 insertions(+), 1 deletion(-) create mode 100644 Documentation/devicetree/bindings/bus/bcma.txt (limited to 'include/linux') diff --git a/Documentation/devicetree/bindings/bus/bcma.txt b/Documentation/devicetree/bindings/bus/bcma.txt new file mode 100644 index 000000000000..e9070c161172 --- /dev/null +++ b/Documentation/devicetree/bindings/bus/bcma.txt @@ -0,0 +1,20 @@ +Driver for ARM AXI Bus with Broadcom Plugins (bcma) + +Required properties: + +- compatible : brcm,bus-axi + +- reg : iomem address range of chipcommon core + +The cores on the AXI bus are automatically detected by bcma with the +memory ranges they are using and they get registered afterwards. 
+ +Example: + + axi@18000000 { + compatible = "brcm,bus-axi"; + reg = <0x18000000 0x1000>; + ranges = <0x00000000 0x18000000 0x00100000>; + #address-cells = <1>; + #size-cells = <1>; + }; diff --git a/drivers/bcma/bcma_private.h b/drivers/bcma/bcma_private.h index b40be43c6f31..b6412b2d748d 100644 --- a/drivers/bcma/bcma_private.h +++ b/drivers/bcma/bcma_private.h @@ -88,6 +88,20 @@ extern int __init bcma_host_pci_init(void); extern void __exit bcma_host_pci_exit(void); #endif /* CONFIG_BCMA_HOST_PCI */ +/* host_soc.c */ +#if defined(CONFIG_BCMA_HOST_SOC) && defined(CONFIG_OF) +extern int __init bcma_host_soc_register_driver(void); +extern void __exit bcma_host_soc_unregister_driver(void); +#else +static inline int __init bcma_host_soc_register_driver(void) +{ + return 0; +} +static inline void __exit bcma_host_soc_unregister_driver(void) +{ +} +#endif /* CONFIG_BCMA_HOST_SOC && CONFIG_OF */ + /* driver_pci.c */ u32 bcma_pcie_read(struct bcma_drv_pci *pc, u32 address); diff --git a/drivers/bcma/host_soc.c b/drivers/bcma/host_soc.c index 718e054dd727..335cbcfd945b 100644 --- a/drivers/bcma/host_soc.c +++ b/drivers/bcma/host_soc.c @@ -7,6 +7,9 @@ #include "bcma_private.h" #include "scan.h" +#include +#include +#include #include #include @@ -176,6 +179,7 @@ int __init bcma_host_soc_register(struct bcma_soc *soc) /* Host specific */ bus->hosttype = BCMA_HOSTTYPE_SOC; bus->ops = &bcma_host_soc_ops; + bus->host_pdev = NULL; /* Initialize struct, detect chip */ bcma_init_bus(bus); @@ -195,3 +199,80 @@ int __init bcma_host_soc_init(struct bcma_soc *soc) return err; } + +#ifdef CONFIG_OF +static int bcma_host_soc_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct device_node *np = dev->of_node; + struct bcma_bus *bus; + int err; + + /* Alloc */ + bus = devm_kzalloc(dev, sizeof(*bus), GFP_KERNEL); + if (!bus) + return -ENOMEM; + + /* Map MMIO */ + bus->mmio = of_iomap(np, 0); + if (!bus->mmio) + return -ENOMEM; + + /* Host specific */ + bus->hosttype = BCMA_HOSTTYPE_SOC; + bus->ops = &bcma_host_soc_ops; + bus->host_pdev = pdev; + + /* Initialize struct, detect chip */ + bcma_init_bus(bus); + + /* Register */ + err = bcma_bus_register(bus); + if (err) + goto err_unmap_mmio; + + platform_set_drvdata(pdev, bus); + + return err; + +err_unmap_mmio: + iounmap(bus->mmio); + return err; +} + +static int bcma_host_soc_remove(struct platform_device *pdev) +{ + struct bcma_bus *bus = platform_get_drvdata(pdev); + + bcma_bus_unregister(bus); + iounmap(bus->mmio); + platform_set_drvdata(pdev, NULL); + + return 0; +} + +static const struct of_device_id bcma_host_soc_of_match[] = { + { .compatible = "brcm,bus-axi", }, + {}, +}; +MODULE_DEVICE_TABLE(of, bcma_host_soc_of_match); + +static struct platform_driver bcma_host_soc_driver = { + .driver = { + .name = "bcma-host-soc", + .of_match_table = bcma_host_soc_of_match, + }, + .probe = bcma_host_soc_probe, + .remove = bcma_host_soc_remove, +}; + +int __init bcma_host_soc_register_driver(void) +{ + return platform_driver_register(&bcma_host_soc_driver); +} + +void __exit bcma_host_soc_unregister_driver(void) +{ + platform_driver_unregister(&bcma_host_soc_driver); +} +#endif /* CONFIG_OF */ diff --git a/drivers/bcma/main.c b/drivers/bcma/main.c index c421403cab43..d1656c2f70af 100644 --- a/drivers/bcma/main.c +++ b/drivers/bcma/main.c @@ -10,6 +10,7 @@ #include #include #include +#include MODULE_DESCRIPTION("Broadcom's specific AMBA driver"); MODULE_LICENSE("GPL"); @@ -131,6 +132,43 @@ static bool bcma_is_core_needed_early(u16 core_id) 
return false; } +#ifdef CONFIG_OF +static struct device_node *bcma_of_find_child_device(struct platform_device *parent, + struct bcma_device *core) +{ + struct device_node *node; + u64 size; + const __be32 *reg; + + if (!parent || !parent->dev.of_node) + return NULL; + + for_each_child_of_node(parent->dev.of_node, node) { + reg = of_get_address(node, 0, &size, NULL); + if (!reg) + continue; + if (of_translate_address(node, reg) == core->addr) + return node; + } + return NULL; +} + +static void bcma_of_fill_device(struct platform_device *parent, + struct bcma_device *core) +{ + struct device_node *node; + + node = bcma_of_find_child_device(parent, core); + if (node) + core->dev.of_node = node; +} +#else +static void bcma_of_fill_device(struct platform_device *parent, + struct bcma_device *core) +{ +} +#endif /* CONFIG_OF */ + static void bcma_register_core(struct bcma_bus *bus, struct bcma_device *core) { int err; @@ -147,7 +185,13 @@ static void bcma_register_core(struct bcma_bus *bus, struct bcma_device *core) break; case BCMA_HOSTTYPE_SOC: core->dev.dma_mask = &core->dev.coherent_dma_mask; - core->dma_dev = &core->dev; + if (bus->host_pdev) { + core->dma_dev = &bus->host_pdev->dev; + core->dev.parent = &bus->host_pdev->dev; + bcma_of_fill_device(bus->host_pdev, core); + } else { + core->dma_dev = &core->dev; + } break; case BCMA_HOSTTYPE_SDIO: break; @@ -528,6 +572,11 @@ static int __init bcma_modinit(void) if (err) return err; + err = bcma_host_soc_register_driver(); + if (err) { + pr_err("SoC host initialization failed\n"); + err = 0; + } #ifdef CONFIG_BCMA_HOST_PCI err = bcma_host_pci_init(); if (err) { @@ -545,6 +594,7 @@ static void __exit bcma_modexit(void) #ifdef CONFIG_BCMA_HOST_PCI bcma_host_pci_exit(); #endif + bcma_host_soc_unregister_driver(); bus_unregister(&bcma_bus_type); } module_exit(bcma_modexit) diff --git a/include/linux/bcma/bcma.h b/include/linux/bcma/bcma.h index 634597917670..729f48e6b20b 100644 --- a/include/linux/bcma/bcma.h +++ b/include/linux/bcma/bcma.h @@ -323,6 +323,8 @@ struct bcma_bus { struct pci_dev *host_pci; /* Pointer to the SDIO device (only for BCMA_HOSTTYPE_SDIO) */ struct sdio_func *host_sdio; + /* Pointer to platform device (only for BCMA_HOSTTYPE_SOC) */ + struct platform_device *host_pdev; }; struct bcma_chipinfo chipinfo; -- cgit v1.2.3-59-g8ed1b From e1c00e10e92c04aa637126db2e59b092bd4878f8 Mon Sep 17 00:00:00 2001 From: Majd Dibbiny Date: Tue, 30 Sep 2014 12:03:48 +0300 Subject: net/mlx4_core: New init and exit flow for mlx4_core In the new flow, we separate the pci initialization and teardown from the initialization and teardown of the other resources. __mlx4_init_one handles the pci resources initialization. It then calls mlx4_load_one to initialize the remainder of the resources. When removing a device, mlx4_remove_one is invoked. However, now mlx4_remove_one calls mlx4_unload_one to free all the resources except the pci resources. When mlx4_unload_one returns, mlx4_remove_one then frees the pci resources. The above separation will allow us to implement 'reset flow' in the future. It will also enable more EQs for VFs and is a pre-step to the modern API to enable/disable SRIOV. Also added nvfs; an integer array of size MLX4_MAX_PORTS + 1; to the mlx4_dev struct. This new field is used to avoid parsing the num_vfs module parameter each time the mlx4_restart_one is called. Signed-off-by: Majd Dibbiny Signed-off-by: Or Gerlitz Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlx4/main.c | 393 +++++++++++++++++------------- include/linux/mlx4/device.h | 1 + 2 files changed, 220 insertions(+), 174 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c index 4e9857b409f3..f6c32a947185 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c +++ b/drivers/net/ethernet/mellanox/mlx4/main.c @@ -2259,116 +2259,18 @@ static void mlx4_free_ownership(struct mlx4_dev *dev) iounmap(owner); } -static int __mlx4_init_one(struct pci_dev *pdev, int pci_dev_data) +static int mlx4_load_one(struct pci_dev *pdev, int pci_dev_data, + int total_vfs, int *nvfs, struct mlx4_priv *priv) { - struct mlx4_priv *priv; struct mlx4_dev *dev; + unsigned sum = 0; int err; int port; - int nvfs[MLX4_MAX_PORTS + 1] = {0, 0, 0}; - int prb_vf[MLX4_MAX_PORTS + 1] = {0, 0, 0}; - const int param_map[MLX4_MAX_PORTS + 1][MLX4_MAX_PORTS + 1] = { - {2, 0, 0}, {0, 1, 2}, {0, 1, 2} }; - unsigned total_vfs = 0; - int sriov_initialized = 0; - unsigned int i; + int i; int existing_vfs = 0; - pr_info(DRV_NAME ": Initializing %s\n", pci_name(pdev)); - - err = pci_enable_device(pdev); - if (err) { - dev_err(&pdev->dev, "Cannot enable PCI device, aborting\n"); - return err; - } - - /* Due to requirement that all VFs and the PF are *guaranteed* 2 MACS - * per port, we must limit the number of VFs to 63 (since their are - * 128 MACs) - */ - for (i = 0; i < sizeof(nvfs)/sizeof(nvfs[0]) && i < num_vfs_argc; - total_vfs += nvfs[param_map[num_vfs_argc - 1][i]], i++) { - nvfs[param_map[num_vfs_argc - 1][i]] = num_vfs[i]; - if (nvfs[i] < 0) { - dev_err(&pdev->dev, "num_vfs module parameter cannot be negative\n"); - return -EINVAL; - } - } - for (i = 0; i < sizeof(prb_vf)/sizeof(prb_vf[0]) && i < probe_vfs_argc; - i++) { - prb_vf[param_map[probe_vfs_argc - 1][i]] = probe_vf[i]; - if (prb_vf[i] < 0 || prb_vf[i] > nvfs[i]) { - dev_err(&pdev->dev, "probe_vf module parameter cannot be negative or greater than num_vfs\n"); - return -EINVAL; - } - } - if (total_vfs >= MLX4_MAX_NUM_VF) { - dev_err(&pdev->dev, - "Requested more VF's (%d) than allowed (%d)\n", - total_vfs, MLX4_MAX_NUM_VF - 1); - return -EINVAL; - } - - for (i = 0; i < MLX4_MAX_PORTS; i++) { - if (nvfs[i] + nvfs[2] >= MLX4_MAX_NUM_VF_P_PORT) { - dev_err(&pdev->dev, - "Requested more VF's (%d) for port (%d) than allowed (%d)\n", - nvfs[i] + nvfs[2], i + 1, - MLX4_MAX_NUM_VF_P_PORT - 1); - return -EINVAL; - } - } - - - /* - * Check for BARs. 
- */ - if (!(pci_dev_data & MLX4_PCI_DEV_IS_VF) && - !(pci_resource_flags(pdev, 0) & IORESOURCE_MEM)) { - dev_err(&pdev->dev, "Missing DCS, aborting (driver_data: 0x%x, pci_resource_flags(pdev, 0):0x%lx)\n", - pci_dev_data, pci_resource_flags(pdev, 0)); - err = -ENODEV; - goto err_disable_pdev; - } - if (!(pci_resource_flags(pdev, 2) & IORESOURCE_MEM)) { - dev_err(&pdev->dev, "Missing UAR, aborting\n"); - err = -ENODEV; - goto err_disable_pdev; - } - - err = pci_request_regions(pdev, DRV_NAME); - if (err) { - dev_err(&pdev->dev, "Couldn't get PCI resources, aborting\n"); - goto err_disable_pdev; - } - - pci_set_master(pdev); + dev = &priv->dev; - err = pci_set_dma_mask(pdev, DMA_BIT_MASK(64)); - if (err) { - dev_warn(&pdev->dev, "Warning: couldn't set 64-bit PCI DMA mask\n"); - err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); - if (err) { - dev_err(&pdev->dev, "Can't set PCI DMA mask, aborting\n"); - goto err_release_regions; - } - } - err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); - if (err) { - dev_warn(&pdev->dev, "Warning: couldn't set 64-bit consistent PCI DMA mask\n"); - err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); - if (err) { - dev_err(&pdev->dev, "Can't set consistent PCI DMA mask, aborting\n"); - goto err_release_regions; - } - } - - /* Allow large DMA segments, up to the firmware limit of 1 GB */ - dma_set_max_seg_size(&pdev->dev, 1024 * 1024 * 1024); - - dev = pci_get_drvdata(pdev); - priv = mlx4_priv(dev); - dev->pdev = pdev; INIT_LIST_HEAD(&priv->ctx_list); spin_lock_init(&priv->ctx_lock); @@ -2382,28 +2284,9 @@ static int __mlx4_init_one(struct pci_dev *pdev, int pci_dev_data) dev->rev_id = pdev->revision; dev->numa_node = dev_to_node(&pdev->dev); + /* Detect if this device is a virtual function */ if (pci_dev_data & MLX4_PCI_DEV_IS_VF) { - /* When acting as pf, we normally skip vfs unless explicitly - * requested to probe them. 
*/ - if (total_vfs) { - unsigned vfs_offset = 0; - for (i = 0; i < sizeof(nvfs)/sizeof(nvfs[0]) && - vfs_offset + nvfs[i] < extended_func_num(pdev); - vfs_offset += nvfs[i], i++) - ; - if (i == sizeof(nvfs)/sizeof(nvfs[0])) { - err = -ENODEV; - goto err_free_dev; - } - if ((extended_func_num(pdev) - vfs_offset) - > prb_vf[i]) { - mlx4_warn(dev, "Skipping virtual function:%d\n", - extended_func_num(pdev)); - err = -ENODEV; - goto err_free_dev; - } - } mlx4_warn(dev, "Detected virtual function - running in slave mode\n"); dev->flags |= MLX4_FLAG_SLAVE; } else { @@ -2413,11 +2296,10 @@ static int __mlx4_init_one(struct pci_dev *pdev, int pci_dev_data) err = mlx4_get_ownership(dev); if (err) { if (err < 0) - goto err_free_dev; + return err; else { mlx4_warn(dev, "Multiple PFs not yet supported - Skipping PF\n"); - err = -EINVAL; - goto err_free_dev; + return -EINVAL; } } @@ -2429,7 +2311,8 @@ static int __mlx4_init_one(struct pci_dev *pdev, int pci_dev_data) GFP_KERNEL); if (NULL == dev->dev_vfs) { mlx4_err(dev, "Failed to allocate memory for VFs\n"); - err = 0; + err = -ENOMEM; + goto err_free_own; } else { atomic_inc(&pf_loading); existing_vfs = pci_num_vf(pdev); @@ -2445,13 +2328,11 @@ static int __mlx4_init_one(struct pci_dev *pdev, int pci_dev_data) mlx4_err(dev, "Failed to enable SR-IOV, continuing without SR-IOV (err = %d)\n", err); atomic_dec(&pf_loading); - err = 0; } else { mlx4_warn(dev, "Running in master mode\n"); dev->flags |= MLX4_FLAG_SRIOV | MLX4_FLAG_MASTER; dev->num_vfs = total_vfs; - sriov_initialized = 1; } } } @@ -2467,7 +2348,7 @@ static int __mlx4_init_one(struct pci_dev *pdev, int pci_dev_data) err = mlx4_reset(dev); if (err) { mlx4_err(dev, "Failed to reset HCA, aborting\n"); - goto err_rel_own; + goto err_sriov; } } @@ -2517,34 +2398,46 @@ slave_start: /* In master functions, the communication channel must be initialized * after obtaining its address from fw */ if (mlx4_is_master(dev)) { - unsigned sum = 0; - err = mlx4_multi_func_init(dev); - if (err) { - mlx4_err(dev, "Failed to init master mfunc interface, aborting\n"); + int ib_ports = 0; + + mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) + ib_ports++; + + if (ib_ports && + (num_vfs_argc > 1 || probe_vfs_argc > 1)) { + mlx4_err(dev, + "Invalid syntax of num_vfs/probe_vfs with IB port - single port VFs syntax is only supported when all ports are configured as ethernet\n"); + err = -EINVAL; + goto err_close; + } + if (dev->caps.num_ports < 2 && + num_vfs_argc > 1) { + err = -EINVAL; + mlx4_err(dev, + "Error: Trying to configure VFs on port 2, but HCA has only %d physical ports\n", + dev->caps.num_ports); goto err_close; } - if (sriov_initialized) { - int ib_ports = 0; - mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) - ib_ports++; + memcpy(dev->nvfs, nvfs, sizeof(dev->nvfs)); - if (ib_ports && - (num_vfs_argc > 1 || probe_vfs_argc > 1)) { - mlx4_err(dev, - "Invalid syntax of num_vfs/probe_vfs with IB port - single port VFs syntax is only supported when all ports are configured as ethernet\n"); - err = -EINVAL; - goto err_master_mfunc; - } - for (i = 0; i < sizeof(nvfs)/sizeof(nvfs[0]); i++) { - unsigned j; - for (j = 0; j < nvfs[i]; ++sum, ++j) { - dev->dev_vfs[sum].min_port = - i < 2 ? i + 1 : 1; - dev->dev_vfs[sum].n_ports = i < 2 ? 1 : - dev->caps.num_ports; - } + for (i = 0; i < sizeof(dev->nvfs)/sizeof(dev->nvfs[0]); i++) { + unsigned j; + + for (j = 0; j < dev->nvfs[i]; ++sum, ++j) { + dev->dev_vfs[sum].min_port = i < 2 ? i + 1 : 1; + dev->dev_vfs[sum].n_ports = i < 2 ? 
1 : + dev->caps.num_ports; } } + + /* In master functions, the communication channel + * must be initialized after obtaining its address from fw + */ + err = mlx4_multi_func_init(dev); + if (err) { + mlx4_err(dev, "Failed to init master mfunc interface, aborting.\n"); + goto err_close; + } } err = mlx4_alloc_eq_table(dev); @@ -2565,7 +2458,7 @@ slave_start: if (!mlx4_is_slave(dev)) { err = mlx4_init_steering(dev); if (err) - goto err_free_eq; + goto err_disable_msix; } err = mlx4_setup_hca(dev); @@ -2625,6 +2518,10 @@ err_steer: if (!mlx4_is_slave(dev)) mlx4_clear_steering(dev); +err_disable_msix: + if (dev->flags & MLX4_FLAG_MSI_X) + pci_disable_msix(pdev); + err_free_eq: mlx4_free_eq_table(dev); @@ -2641,9 +2538,6 @@ err_master_mfunc: } err_close: - if (dev->flags & MLX4_FLAG_MSI_X) - pci_disable_msix(pdev); - mlx4_close_hca(dev); err_mfunc: @@ -2657,17 +2551,151 @@ err_sriov: if (dev->flags & MLX4_FLAG_SRIOV && !existing_vfs) pci_disable_sriov(pdev); -err_rel_own: - if (!mlx4_is_slave(dev)) - mlx4_free_ownership(dev); - if (mlx4_is_master(dev) && dev->num_vfs) atomic_dec(&pf_loading); kfree(priv->dev.dev_vfs); -err_free_dev: - kfree(priv); +err_free_own: + if (!mlx4_is_slave(dev)) + mlx4_free_ownership(dev); + + return err; +} + +static int __mlx4_init_one(struct pci_dev *pdev, int pci_dev_data, + struct mlx4_priv *priv) +{ + int err; + int nvfs[MLX4_MAX_PORTS + 1] = {0, 0, 0}; + int prb_vf[MLX4_MAX_PORTS + 1] = {0, 0, 0}; + const int param_map[MLX4_MAX_PORTS + 1][MLX4_MAX_PORTS + 1] = { + {2, 0, 0}, {0, 1, 2}, {0, 1, 2} }; + unsigned total_vfs = 0; + unsigned int i; + + pr_info(DRV_NAME ": Initializing %s\n", pci_name(pdev)); + + err = pci_enable_device(pdev); + if (err) { + dev_err(&pdev->dev, "Cannot enable PCI device, aborting\n"); + return err; + } + + /* Due to requirement that all VFs and the PF are *guaranteed* 2 MACS + * per port, we must limit the number of VFs to 63 (since their are + * 128 MACs) + */ + for (i = 0; i < sizeof(nvfs)/sizeof(nvfs[0]) && i < num_vfs_argc; + total_vfs += nvfs[param_map[num_vfs_argc - 1][i]], i++) { + nvfs[param_map[num_vfs_argc - 1][i]] = num_vfs[i]; + if (nvfs[i] < 0) { + dev_err(&pdev->dev, "num_vfs module parameter cannot be negative\n"); + err = -EINVAL; + goto err_disable_pdev; + } + } + for (i = 0; i < sizeof(prb_vf)/sizeof(prb_vf[0]) && i < probe_vfs_argc; + i++) { + prb_vf[param_map[probe_vfs_argc - 1][i]] = probe_vf[i]; + if (prb_vf[i] < 0 || prb_vf[i] > nvfs[i]) { + dev_err(&pdev->dev, "probe_vf module parameter cannot be negative or greater than num_vfs\n"); + err = -EINVAL; + goto err_disable_pdev; + } + } + if (total_vfs >= MLX4_MAX_NUM_VF) { + dev_err(&pdev->dev, + "Requested more VF's (%d) than allowed (%d)\n", + total_vfs, MLX4_MAX_NUM_VF - 1); + err = -EINVAL; + goto err_disable_pdev; + } + + for (i = 0; i < MLX4_MAX_PORTS; i++) { + if (nvfs[i] + nvfs[2] >= MLX4_MAX_NUM_VF_P_PORT) { + dev_err(&pdev->dev, + "Requested more VF's (%d) for port (%d) than allowed (%d)\n", + nvfs[i] + nvfs[2], i + 1, + MLX4_MAX_NUM_VF_P_PORT - 1); + err = -EINVAL; + goto err_disable_pdev; + } + } + + /* Check for BARs. 
*/ + if (!(pci_dev_data & MLX4_PCI_DEV_IS_VF) && + !(pci_resource_flags(pdev, 0) & IORESOURCE_MEM)) { + dev_err(&pdev->dev, "Missing DCS, aborting (driver_data: 0x%x, pci_resource_flags(pdev, 0):0x%lx)\n", + pci_dev_data, pci_resource_flags(pdev, 0)); + err = -ENODEV; + goto err_disable_pdev; + } + if (!(pci_resource_flags(pdev, 2) & IORESOURCE_MEM)) { + dev_err(&pdev->dev, "Missing UAR, aborting\n"); + err = -ENODEV; + goto err_disable_pdev; + } + + err = pci_request_regions(pdev, DRV_NAME); + if (err) { + dev_err(&pdev->dev, "Couldn't get PCI resources, aborting\n"); + goto err_disable_pdev; + } + + pci_set_master(pdev); + + err = pci_set_dma_mask(pdev, DMA_BIT_MASK(64)); + if (err) { + dev_warn(&pdev->dev, "Warning: couldn't set 64-bit PCI DMA mask\n"); + err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); + if (err) { + dev_err(&pdev->dev, "Can't set PCI DMA mask, aborting\n"); + goto err_release_regions; + } + } + err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); + if (err) { + dev_warn(&pdev->dev, "Warning: couldn't set 64-bit consistent PCI DMA mask\n"); + err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); + if (err) { + dev_err(&pdev->dev, "Can't set consistent PCI DMA mask, aborting\n"); + goto err_release_regions; + } + } + + /* Allow large DMA segments, up to the firmware limit of 1 GB */ + dma_set_max_seg_size(&pdev->dev, 1024 * 1024 * 1024); + /* Detect if this device is a virtual function */ + if (pci_dev_data & MLX4_PCI_DEV_IS_VF) { + /* When acting as pf, we normally skip vfs unless explicitly + * requested to probe them. + */ + if (total_vfs) { + unsigned vfs_offset = 0; + + for (i = 0; i < sizeof(nvfs)/sizeof(nvfs[0]) && + vfs_offset + nvfs[i] < extended_func_num(pdev); + vfs_offset += nvfs[i], i++) + ; + if (i == sizeof(nvfs)/sizeof(nvfs[0])) { + err = -ENODEV; + goto err_release_regions; + } + if ((extended_func_num(pdev) - vfs_offset) + > prb_vf[i]) { + dev_warn(&pdev->dev, "Skipping virtual function:%d\n", + extended_func_num(pdev)); + err = -ENODEV; + goto err_release_regions; + } + } + } + + err = mlx4_load_one(pdev, pci_dev_data, total_vfs, nvfs, priv); + if (err) + goto err_release_regions; + return 0; err_release_regions: pci_release_regions(pdev); @@ -2682,6 +2710,7 @@ static int mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id) { struct mlx4_priv *priv; struct mlx4_dev *dev; + int ret; printk_once(KERN_INFO "%s", mlx4_version); @@ -2690,13 +2719,18 @@ static int mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id) return -ENOMEM; dev = &priv->dev; + dev->pdev = pdev; pci_set_drvdata(pdev, dev); priv->pci_dev_data = id->driver_data; - return __mlx4_init_one(pdev, id->driver_data); + ret = __mlx4_init_one(pdev, id->driver_data, priv); + if (ret) + kfree(priv); + + return ret; } -static void __mlx4_remove_one(struct pci_dev *pdev) +static void mlx4_unload_one(struct pci_dev *pdev) { struct mlx4_dev *dev = pci_get_drvdata(pdev); struct mlx4_priv *priv = mlx4_priv(dev); @@ -2775,8 +2809,6 @@ static void __mlx4_remove_one(struct pci_dev *pdev) kfree(dev->caps.qp1_proxy); kfree(dev->dev_vfs); - pci_release_regions(pdev); - pci_disable_device(pdev); memset(priv, 0, sizeof(*priv)); priv->pci_dev_data = pci_dev_data; priv->removed = 1; @@ -2787,7 +2819,9 @@ static void mlx4_remove_one(struct pci_dev *pdev) struct mlx4_dev *dev = pci_get_drvdata(pdev); struct mlx4_priv *priv = mlx4_priv(dev); - __mlx4_remove_one(pdev); + mlx4_unload_one(pdev); + pci_release_regions(pdev); + pci_disable_device(pdev); kfree(priv); 
pci_set_drvdata(pdev, NULL); } @@ -2796,11 +2830,22 @@ int mlx4_restart_one(struct pci_dev *pdev) { struct mlx4_dev *dev = pci_get_drvdata(pdev); struct mlx4_priv *priv = mlx4_priv(dev); - int pci_dev_data; + int nvfs[MLX4_MAX_PORTS + 1] = {0, 0, 0}; + int pci_dev_data, err, total_vfs; pci_dev_data = priv->pci_dev_data; - __mlx4_remove_one(pdev); - return __mlx4_init_one(pdev, pci_dev_data); + total_vfs = dev->num_vfs; + memcpy(nvfs, dev->nvfs, sizeof(dev->nvfs)); + + mlx4_unload_one(pdev); + err = mlx4_load_one(pdev, pci_dev_data, total_vfs, nvfs, priv); + if (err) { + mlx4_err(dev, "%s: ERROR: mlx4_load_one failed, pci_name=%s, err=%d\n", + __func__, pci_name(pdev), err); + return err; + } + + return err; } static const struct pci_device_id mlx4_pci_table[] = { @@ -2854,7 +2899,7 @@ MODULE_DEVICE_TABLE(pci, mlx4_pci_table); static pci_ers_result_t mlx4_pci_err_detected(struct pci_dev *pdev, pci_channel_state_t state) { - __mlx4_remove_one(pdev); + mlx4_unload_one(pdev); return state == pci_channel_io_perm_failure ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_NEED_RESET; @@ -2866,7 +2911,7 @@ static pci_ers_result_t mlx4_pci_slot_reset(struct pci_dev *pdev) { struct mlx4_dev *dev = pci_get_drvdata(pdev); struct mlx4_priv *priv = mlx4_priv(dev); int ret; - ret = __mlx4_init_one(pdev, priv->pci_dev_data); + ret = __mlx4_init_one(pdev, priv->pci_dev_data, priv); return ret ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; } @@ -2880,7 +2925,7 @@ static struct pci_driver mlx4_driver = { .name = DRV_NAME, .id_table = mlx4_pci_table, .probe = mlx4_init_one, - .shutdown = __mlx4_remove_one, + .shutdown = mlx4_unload_one, .remove = mlx4_remove_one, .err_handler = &mlx4_err_handler, }; diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 03b5608a4329..b2f8ab9a57c4 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -707,6 +707,7 @@ struct mlx4_dev { u64 regid_promisc_array[MLX4_MAX_PORTS + 1]; u64 regid_allmulti_array[MLX4_MAX_PORTS + 1]; struct mlx4_vf_dev *dev_vfs; + int nvfs[MLX4_MAX_PORTS + 1]; }; struct mlx4_eqe { -- cgit v1.2.3-59-g8ed1b From d0bf4a9e92b9a93ffeeacbd7b6cb83e0ee3dc2ef Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 29 Sep 2014 13:29:15 -0700 Subject: net: cleanup and document skb fclone layout Let's use a proper structure to clearly document and implement skb fast clones. Then, we might experiment more easily with alternative layouts. This patch adds a new skb_fclone_busy() helper, used by tcp and xfrm, to stop leaking of implementation details. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller
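To illustrate the layout trick this message relies on, here is a minimal user-space sketch (not kernel code; struct names and values are stand-ins): a pair of buffers and their refcount live in one allocation, and container_of() recovers the enclosing bundle from a pointer to either member, replacing the old "skb + 1" and "(atomic_t *)(child + 1)" pointer arithmetic.

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct buf {			/* stand-in for struct sk_buff */
	int fclone;
};

struct buf_fclones {		/* stand-in for struct sk_buff_fclones */
	struct buf skb1;
	struct buf skb2;
	int fclone_ref;		/* stand-in for atomic_t */
};

int main(void)
{
	struct buf_fclones bundle = { .fclone_ref = 1 };
	struct buf *orig = &bundle.skb1;
	struct buf *clone = &bundle.skb2;

	/* From either member, the whole bundle is recoverable. */
	struct buf_fclones *a = container_of(orig, struct buf_fclones, skb1);
	struct buf_fclones *b = container_of(clone, struct buf_fclones, skb2);

	printf("same bundle: %d, refcount: %d\n", a == b, a->fclone_ref);
	return 0;
}

The gain is that the compiler now enforces the layout instead of the programmer: the kfree_skbmem() and skb_clone() rewrites in the diff below become immune to member reordering or padding surprises.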
--- include/linux/skbuff.h | 25 +++++++++++++++++++++++++ net/core/skbuff.c | 41 ++++++++++++++++++++--------------------- net/ipv4/tcp_output.c | 5 +---- net/xfrm/xfrm_policy.c | 4 +--- 4 files changed, 47 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 262efdbc346b..d8f7d74d5a4d 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -781,6 +781,31 @@ struct sk_buff *alloc_skb_with_frags(unsigned long header_len, int *errcode, gfp_t gfp_mask); +/* Layout of fast clones : [skb1][skb2][fclone_ref] */ +struct sk_buff_fclones { + struct sk_buff skb1; + + struct sk_buff skb2; + + atomic_t fclone_ref; +}; + +/** + * skb_fclone_busy - check if fclone is busy + * @skb: buffer + * + * Returns true is skb is a fast clone, and its clone is not freed. */ +static inline bool skb_fclone_busy(const struct sk_buff *skb) +{ + const struct sk_buff_fclones *fclones; + + fclones = container_of(skb, struct sk_buff_fclones, skb1); + + return skb->fclone == SKB_FCLONE_ORIG && + fclones->skb2.fclone == SKB_FCLONE_CLONE; +} + static inline struct sk_buff *alloc_skb_fclone(unsigned int size, gfp_t priority) { diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 4be570a4ab21..a8cebb40699c 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -257,15 +257,16 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, kmemcheck_annotate_variable(shinfo->destructor_arg); if (flags & SKB_ALLOC_FCLONE) { - struct sk_buff *child = skb + 1; - atomic_t *fclone_ref = (atomic_t *) (child + 1); + struct sk_buff_fclones *fclones; - kmemcheck_annotate_bitfield(child, flags1); + fclones = container_of(skb, struct sk_buff_fclones, skb1); + + kmemcheck_annotate_bitfield(&fclones->skb2, flags1); skb->fclone = SKB_FCLONE_ORIG; - atomic_set(fclone_ref, 1); + atomic_set(&fclones->fclone_ref, 1); - child->fclone = SKB_FCLONE_UNAVAILABLE; - child->pfmemalloc = pfmemalloc; + fclones->skb2.fclone = SKB_FCLONE_UNAVAILABLE; + fclones->skb2.pfmemalloc = pfmemalloc; } out: return skb; @@ -524,8 +525,7 @@ static void skb_release_data(struct sk_buff *skb) */ static void kfree_skbmem(struct sk_buff *skb) { - struct sk_buff *other; - atomic_t *fclone_ref; + struct sk_buff_fclones *fclones; switch (skb->fclone) { case SKB_FCLONE_UNAVAILABLE: @@ -533,22 +533,21 @@ static void kfree_skbmem(struct sk_buff *skb) break; case SKB_FCLONE_ORIG: - fclone_ref = (atomic_t *) (skb + 2); - if (atomic_dec_and_test(fclone_ref)) - kmem_cache_free(skbuff_fclone_cache, skb); + fclones = container_of(skb, struct sk_buff_fclones, skb1); + if (atomic_dec_and_test(&fclones->fclone_ref)) + kmem_cache_free(skbuff_fclone_cache, fclones); break; case SKB_FCLONE_CLONE: - fclone_ref = (atomic_t *) (skb + 1); - other = skb - 1; + fclones = container_of(skb, struct sk_buff_fclones, skb2); /* The clone portion is available for * fast-cloning again.
*/ skb->fclone = SKB_FCLONE_UNAVAILABLE; - if (atomic_dec_and_test(fclone_ref)) - kmem_cache_free(skbuff_fclone_cache, other); + if (atomic_dec_and_test(&fclones->fclone_ref)) + kmem_cache_free(skbuff_fclone_cache, fclones); break; } } @@ -859,17 +858,18 @@ EXPORT_SYMBOL_GPL(skb_copy_ubufs); struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) { - struct sk_buff *n; + struct sk_buff_fclones *fclones = container_of(skb, + struct sk_buff_fclones, + skb1); + struct sk_buff *n = &fclones->skb2; if (skb_orphan_frags(skb, gfp_mask)) return NULL; - n = skb + 1; if (skb->fclone == SKB_FCLONE_ORIG && n->fclone == SKB_FCLONE_UNAVAILABLE) { - atomic_t *fclone_ref = (atomic_t *) (n + 1); n->fclone = SKB_FCLONE_CLONE; - atomic_inc(fclone_ref); + atomic_inc(&fclones->fclone_ref); } else { if (skb_pfmemalloc(skb)) gfp_mask |= __GFP_MEMALLOC; @@ -3240,8 +3240,7 @@ void __init skb_init(void) SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache", - (2*sizeof(struct sk_buff)) + - sizeof(atomic_t), + sizeof(struct sk_buff_fclones), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index ee567e9e98c3..8d4eac793700 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2110,10 +2110,7 @@ bool tcp_schedule_loss_probe(struct sock *sk) static bool skb_still_in_host_queue(const struct sock *sk, const struct sk_buff *skb) { - const struct sk_buff *fclone = skb + 1; - - if (unlikely(skb->fclone == SKB_FCLONE_ORIG && - fclone->fclone == SKB_FCLONE_CLONE)) { + if (unlikely(skb_fclone_busy(skb))) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES); return true; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index f623dca6ce30..4c4e457e7888 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1961,10 +1961,8 @@ static int xdst_queue_output(struct sock *sk, struct sk_buff *skb) struct xfrm_dst *xdst = (struct xfrm_dst *) dst; struct xfrm_policy *pol = xdst->pols[0]; struct xfrm_policy_queue *pq = &pol->polq; - const struct sk_buff *fclone = skb + 1; - if (unlikely(skb->fclone == SKB_FCLONE_ORIG && - fclone->fclone == SKB_FCLONE_CLONE)) { + if (unlikely(skb_fclone_busy(skb))) { kfree_skb(skb); return 0; } -- cgit v1.2.3-59-g8ed1b From 8bce6d7d0d1ede22af334ee241841e9278365278 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 29 Sep 2014 20:22:29 -0700 Subject: udp: Generalize skb_udp_segment skb_udp_tunnel_segment is the function called from udp4_ufo_fragment to segment a UDP tunnel packet. This function currently assumes segmentation is transparent Ethernet bridging (i.e. VXLAN encapsulation). This patch generalizes the function to operate on either Ethertype or IP protocol. The inner_protocol field must be set to the protocol of the inner header. This can now be either an Ethertype or an IP protocol (in a union). A new flag in the skbuff indicates which type is effective. skb_set_inner_protocol and skb_set_inner_ipproto helper functions were added to set the inner_protocol. These functions are called from the point where the tunnel encapsulation is occurring. When skb_udp_tunnel_segment is called, the function to segment the inner packet is selected based on the inner IP or Ethertype. In the case of an IP protocol encapsulation, the function is derived from inet[6]_offloads. In the case of Ethertype, skb->protocol is set to the inner_protocol and skb_mac_gso_segment is called. (GRE currently does this, but it might be possible to look up the protocol in offload_base and call the appropriate segmentation function directly). Signed-off-by: Tom Herbert Signed-off-by: David S. Miller
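The selection logic the message describes can be modeled in a few lines. Below is a self-contained user-space sketch (not kernel code; names and the string return values are illustrative only): a union plus one type bit mirrors skb->inner_protocol/inner_ipproto and skb->inner_protocol_type, and pick_segmenter() plays the role of the switch in the new skb_udp_tunnel_segment().

#include <stdint.h>
#include <stdio.h>

enum { ENCAP_TYPE_ETHER, ENCAP_TYPE_IPPROTO };

struct pkt {
	union {
		uint16_t inner_protocol;	/* Ethertype, e.g. 0x6558 (TEB) */
		uint8_t  inner_ipproto;		/* IP protocol, e.g. 4 (IPIP)   */
	};
	unsigned int inner_protocol_type:1;
};

static const char *pick_segmenter(const struct pkt *p)
{
	switch (p->inner_protocol_type) {
	case ENCAP_TYPE_ETHER:
		return "skb_mac_gso_segment";		/* L2 frame inside */
	case ENCAP_TYPE_IPPROTO:
		return "inet[6]_offloads[ipproto]";	/* L3 packet inside */
	}
	return NULL;
}

int main(void)
{
	struct pkt vxlan = { .inner_protocol_type = ENCAP_TYPE_ETHER };
	struct pkt ipip  = { .inner_protocol_type = ENCAP_TYPE_IPPROTO };

	vxlan.inner_protocol = 0x6558;	/* ETH_P_TEB */
	ipip.inner_ipproto = 4;		/* IPPROTO_IPIP */

	printf("vxlan -> %s\n", pick_segmenter(&vxlan));
	printf("ipip  -> %s\n", pick_segmenter(&ipip));
	return 0;
}

A tunnel driver picks the branch at encapsulation time by calling skb_set_inner_protocol() (Ethertype case) or skb_set_inner_ipproto() (IP protocol case), both added in the diff below.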
--- include/linux/skbuff.h | 26 +++++++++++++++++++++++-- include/net/udp.h | 3 ++- net/ipv4/udp_offload.c | 51 +++++++++++++++++++++++++++++++++++++++++++++----- net/ipv6/udp_offload.c | 2 +- 4 files changed, 73 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index d8f7d74d5a4d..7c5036d11feb 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -596,7 +596,8 @@ struct sk_buff { __u8 ndisc_nodetype:2; #endif __u8 ipvs_property:1; - /* 5 or 7 bit hole */ + __u8 inner_protocol_type:1; + /* 4 or 6 bit hole */ #ifdef CONFIG_NET_SCHED __u16 tc_index; /* traffic control index */ @@ -632,7 +633,11 @@ struct sk_buff { __u32 reserved_tailroom; }; - __be16 inner_protocol; + union { + __be16 inner_protocol; + __u8 inner_ipproto; + }; + __u16 inner_transport_header; __u16 inner_network_header; __u16 inner_mac_header; @@ -1762,6 +1767,23 @@ static inline void skb_reserve(struct sk_buff *skb, int len) skb->tail += len; } +#define ENCAP_TYPE_ETHER 0 +#define ENCAP_TYPE_IPPROTO 1 + +static inline void skb_set_inner_protocol(struct sk_buff *skb, + __be16 protocol) +{ + skb->inner_protocol = protocol; + skb->inner_protocol_type = ENCAP_TYPE_ETHER; +} + +static inline void skb_set_inner_ipproto(struct sk_buff *skb, + __u8 ipproto) +{ + skb->inner_ipproto = ipproto; + skb->inner_protocol_type = ENCAP_TYPE_IPPROTO; +} + static inline void skb_reset_inner_headers(struct sk_buff *skb) { skb->inner_mac_header = skb->mac_header; diff --git a/include/net/udp.h b/include/net/udp.h index 16f4e80f0519..07f9b70962f6 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -239,7 +239,8 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg); int udp_disconnect(struct sock *sk, int flags); unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait); struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, - netdev_features_t features); + netdev_features_t features, + bool is_ipv6); int udp_lib_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); int udp_lib_setsockopt(struct sock *sk, int level, int optname, diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 19ebe6a39ddc..8c35f2c939ee 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -25,8 +25,11 @@ struct udp_offload_priv { struct udp_offload_priv __rcu *next; }; -struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, - netdev_features_t features) +static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, + netdev_features_t features, + struct sk_buff *(*gso_inner_segment)(struct sk_buff *skb, + netdev_features_t features), + __be16 new_protocol) { struct sk_buff *segs = ERR_PTR(-EINVAL); u16 mac_offset = skb->mac_header; @@ -48,7 +51,7 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, skb_reset_mac_header(skb); skb_set_network_header(skb, skb_inner_network_offset(skb)); skb->mac_len = skb_inner_network_offset(skb); - skb->protocol = htons(ETH_P_TEB); + skb->protocol = new_protocol; need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM); if (need_csum) @@ -56,7 +59,7 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, /* segment inner packet.
*/ enc_features = skb->dev->hw_enc_features & netif_skb_features(skb); - segs = skb_mac_gso_segment(skb, enc_features); + segs = gso_inner_segment(skb, enc_features); if (IS_ERR_OR_NULL(segs)) { skb_gso_error_unwind(skb, protocol, tnl_hlen, mac_offset, mac_len); @@ -101,6 +104,44 @@ out: return segs; } +struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, + netdev_features_t features, + bool is_ipv6) +{ + __be16 protocol = skb->protocol; + const struct net_offload **offloads; + const struct net_offload *ops; + struct sk_buff *segs = ERR_PTR(-EINVAL); + struct sk_buff *(*gso_inner_segment)(struct sk_buff *skb, + netdev_features_t features); + + rcu_read_lock(); + + switch (skb->inner_protocol_type) { + case ENCAP_TYPE_ETHER: + protocol = skb->inner_protocol; + gso_inner_segment = skb_mac_gso_segment; + break; + case ENCAP_TYPE_IPPROTO: + offloads = is_ipv6 ? inet6_offloads : inet_offloads; + ops = rcu_dereference(offloads[skb->inner_ipproto]); + if (!ops || !ops->callbacks.gso_segment) + goto out_unlock; + gso_inner_segment = ops->callbacks.gso_segment; + break; + default: + goto out_unlock; + } + + segs = __skb_udp_tunnel_segment(skb, features, gso_inner_segment, + protocol); + +out_unlock: + rcu_read_unlock(); + + return segs; +} + static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, netdev_features_t features) { @@ -113,7 +154,7 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, if (skb->encapsulation && (skb_shinfo(skb)->gso_type & (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM))) { - segs = skb_udp_tunnel_segment(skb, features); + segs = skb_udp_tunnel_segment(skb, features, false); goto out; } diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index 212ebfc7973f..8f96988c1db2 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -58,7 +58,7 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, if (skb->encapsulation && skb_shinfo(skb)->gso_type & (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM)) - segs = skb_udp_tunnel_segment(skb, features); + segs = skb_udp_tunnel_segment(skb, features, true); else { const struct ipv6hdr *ipv6h; struct udphdr *uh; -- cgit v1.2.3-59-g8ed1b From d068b02cfdfc27f5962ec82ec5568b706f599edc Mon Sep 17 00:00:00 2001 From: Petri Gynther Date: Wed, 1 Oct 2014 11:58:02 -0700 Subject: net: phy: add BCM7425 and BCM7429 PHYs Signed-off-by: Petri Gynther Acked-by: Florian Fainelli Signed-off-by: David S. 
Miller --- drivers/net/phy/bcm7xxx.c | 28 ++++++++++++++++++++++++++++ include/linux/brcmphy.h | 2 ++ 2 files changed, 30 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/phy/bcm7xxx.c b/drivers/net/phy/bcm7xxx.c index daae69950925..1d211d369039 100644 --- a/drivers/net/phy/bcm7xxx.c +++ b/drivers/net/phy/bcm7xxx.c @@ -350,6 +350,32 @@ static struct phy_driver bcm7xxx_driver[] = { BCM7XXX_28NM_GPHY(PHY_ID_BCM7439, "Broadcom BCM7439"), BCM7XXX_28NM_GPHY(PHY_ID_BCM7445, "Broadcom BCM7445"), { + .phy_id = PHY_ID_BCM7425, + .phy_id_mask = 0xfffffff0, + .name = "Broadcom BCM7425", + .features = PHY_GBIT_FEATURES | + SUPPORTED_Pause | SUPPORTED_Asym_Pause, + .flags = 0, + .config_init = bcm7xxx_config_init, + .config_aneg = genphy_config_aneg, + .read_status = genphy_read_status, + .suspend = bcm7xxx_suspend, + .resume = bcm7xxx_config_init, + .driver = { .owner = THIS_MODULE }, +}, { + .phy_id = PHY_ID_BCM7429, + .phy_id_mask = 0xfffffff0, + .name = "Broadcom BCM7429", + .features = PHY_GBIT_FEATURES | + SUPPORTED_Pause | SUPPORTED_Asym_Pause, + .flags = PHY_IS_INTERNAL, + .config_init = bcm7xxx_config_init, + .config_aneg = genphy_config_aneg, + .read_status = genphy_read_status, + .suspend = bcm7xxx_suspend, + .resume = bcm7xxx_config_init, + .driver = { .owner = THIS_MODULE }, +}, { .phy_id = PHY_BCM_OUI_4, .phy_id_mask = 0xffff0000, .name = "Broadcom BCM7XXX 40nm", @@ -381,6 +407,8 @@ static struct mdio_device_id __maybe_unused bcm7xxx_tbl[] = { { PHY_ID_BCM7250, 0xfffffff0, }, { PHY_ID_BCM7364, 0xfffffff0, }, { PHY_ID_BCM7366, 0xfffffff0, }, + { PHY_ID_BCM7425, 0xfffffff0, }, + { PHY_ID_BCM7429, 0xfffffff0, }, { PHY_ID_BCM7439, 0xfffffff0, }, { PHY_ID_BCM7445, 0xfffffff0, }, { PHY_BCM_OUI_4, 0xffff0000 }, diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h index 3f626fe48c9a..7ccd928cc1f2 100644 --- a/include/linux/brcmphy.h +++ b/include/linux/brcmphy.h @@ -16,6 +16,8 @@ #define PHY_ID_BCM7250 0xae025280 #define PHY_ID_BCM7364 0xae025260 #define PHY_ID_BCM7366 0x600d8490 +#define PHY_ID_BCM7425 0x03625e60 +#define PHY_ID_BCM7429 0x600d8730 #define PHY_ID_BCM7439 0x600d8480 #define PHY_ID_BCM7445 0x600d8510 -- cgit v1.2.3-59-g8ed1b From 55a93b3ea780908b7d1b3a8cf1976223a9268d78 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 3 Oct 2014 15:31:07 -0700 Subject: qdisc: validate skb without holding lock Validation of an skb can be pretty expensive: GSO segmentation and/or checksum computations. We can do this without holding the qdisc lock, so that other cpus can queue additional packets. The trick is that requeued packets were already validated, so we carry a boolean so that sch_direct_xmit() can either validate a fresh skb list or directly use an already-validated one. Tested on a 40Gb NIC (8 TX queues) with 200 concurrent flows on a 48-thread host. Turning TSO on or off had no effect on throughput, only a few more cpu cycles. Lock contention on the qdisc lock disappeared. The same held when disabling TX checksum offload. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller
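The list handling behind the new validate_xmit_skb_list() is worth seeing in isolation. Here is a self-contained user-space model (not kernel code; node ids and the fake validate() are illustrative only): each node may "validate" into a short chain, as GSO segmentation does to one skb, and the chains are spliced into one output list through a head/tail pair.

#include <stdio.h>
#include <stdlib.h>

struct node { int id; struct node *next; };

/* Pretend validation: even ids expand into two segments.
 * (The demo leaks the extra node on exit; fine for a sketch.) */
static struct node *validate(struct node *n)
{
	n->next = NULL;
	if (n->id % 2 == 0) {
		struct node *seg = malloc(sizeof(*seg));
		seg->id = n->id + 100;	/* extra segment */
		seg->next = NULL;
		n->next = seg;
	}
	return n;
}

static struct node *validate_list(struct node *head_in)
{
	struct node *next, *chain, *end;
	struct node *head = NULL, *tail = NULL;

	while (head_in) {
		next = head_in->next;	/* detach one node */
		head_in->next = NULL;
		chain = validate(head_in);
		if (chain) {
			end = chain;	/* walk to end of its chain */
			while (end->next)
				end = end->next;
			if (!head)	/* splice chain onto output */
				head = chain;
			else
				tail->next = chain;
			tail = end;
		}
		head_in = next;
	}
	return head;
}

int main(void)
{
	struct node c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };
	for (struct node *n = validate_list(&a); n; n = n->next)
		printf("%d ", n->id);
	printf("\n");	/* prints: 1 2 102 3 */
	return 0;
}

Because this splicing runs before the TX lock is taken, the expensive per-packet work no longer serializes other cpus on the qdisc lock, which is the whole point of the patch.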
--- include/linux/netdevice.h | 2 +- include/net/pkt_sched.h | 2 +- net/core/dev.c | 29 +++++++++++++++++++--- net/sched/sch_generic.c | 61 ++++++++++++++++++++------------------------- 4 files changed, 56 insertions(+), 38 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 9b7fbacb6296..910fb17ad148 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2821,7 +2821,7 @@ int dev_set_mac_address(struct net_device *, struct sockaddr *); int dev_change_carrier(struct net_device *, bool new_carrier); int dev_get_phys_port_id(struct net_device *dev, struct netdev_phys_port_id *ppid); -struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev); +struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev); struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq, int *ret); int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb); diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index 8bbe626e9ece..e4b3c828c1c2 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -99,7 +99,7 @@ void qdisc_put_stab(struct qdisc_size_table *tab); void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc); int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, struct net_device *dev, struct netdev_queue *txq, - spinlock_t *root_lock); + spinlock_t *root_lock, bool validate); void __qdisc_run(struct Qdisc *q); diff --git a/net/core/dev.c b/net/core/dev.c index e55c546717d4..1a90530f83ff 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2655,7 +2655,7 @@ struct sk_buff *validate_xmit_vlan(struct sk_buff *skb, netdev_features_t featur return skb; } -struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev) +static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev) { netdev_features_t features; @@ -2720,6 +2720,30 @@ out_null: return NULL; } +struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev) +{ + struct sk_buff *next, *head = NULL, *tail; + + while (skb) { + next = skb->next; + skb->next = NULL; + skb = validate_xmit_skb(skb, dev); + if (skb) { + struct sk_buff *end = skb; + + while (end->next) + end = end->next; + if (!head) + head = skb; + else + tail->next = skb; + tail = end; + } + skb = next; + } + return head; +} + static void qdisc_pkt_len_init(struct sk_buff *skb) { const struct skb_shared_info *shinfo = skb_shinfo(skb); @@ -2786,8 +2810,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, qdisc_bstats_update(q, skb); - skb = validate_xmit_skb(skb, dev); - if (skb && sch_direct_xmit(skb, q, dev, txq, root_lock)) { + if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) { if (unlikely(contended)) { spin_unlock(&q->busylock); contended = false; diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 797ebef73642..2b349a4de3c8 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -56,40 +56,34 @@ static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q) return 0; } -static struct sk_buff *try_bulk_dequeue_skb(struct Qdisc *q, - struct sk_buff *head_skb, - int bytelimit) +static void try_bulk_dequeue_skb(struct Qdisc *q, + struct sk_buff *skb, + const struct netdev_queue *txq) { - struct sk_buff *skb, *tail_skb = head_skb; + int bytelimit = qdisc_avail_bulklimit(txq) - skb->len; while (bytelimit > 0) { - skb = q->dequeue(q); - if (!skb)
- break; + struct sk_buff *nskb = q->dequeue(q); - bytelimit -= skb->len; /* covers GSO len */ - skb = validate_xmit_skb(skb, qdisc_dev(q)); - if (!skb) + if (!nskb) break; - while (tail_skb->next) /* GSO list goto tail */ - tail_skb = tail_skb->next; - - tail_skb->next = skb; - tail_skb = skb; + bytelimit -= nskb->len; /* covers GSO len */ + skb->next = nskb; + skb = nskb; } - - return head_skb; + skb->next = NULL; } /* Note that dequeue_skb can possibly return a SKB list (via skb->next). * A requeued skb (via q->gso_skb) can also be a SKB list. */ -static inline struct sk_buff *dequeue_skb(struct Qdisc *q) +static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate) { struct sk_buff *skb = q->gso_skb; const struct netdev_queue *txq = q->dev_queue; + *validate = true; if (unlikely(skb)) { /* check the reason of requeuing without tx lock first */ txq = skb_get_tx_queue(txq->dev, skb); @@ -98,21 +92,16 @@ static inline struct sk_buff *dequeue_skb(struct Qdisc *q) q->q.qlen--; } else skb = NULL; + /* skb in gso_skb were already validated */ + *validate = false; } else { if (!(q->flags & TCQ_F_ONETXQUEUE) || !netif_xmit_frozen_or_stopped(txq)) { - int bytelimit = qdisc_avail_bulklimit(txq); - skb = q->dequeue(q); - if (skb) { - bytelimit -= skb->len; - skb = validate_xmit_skb(skb, qdisc_dev(q)); - } if (skb && qdisc_may_bulk(q)) - skb = try_bulk_dequeue_skb(q, skb, bytelimit); + try_bulk_dequeue_skb(q, skb, txq); } } - return skb; } @@ -156,19 +145,24 @@ static inline int handle_dev_cpu_collision(struct sk_buff *skb, */ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, struct net_device *dev, struct netdev_queue *txq, - spinlock_t *root_lock) + spinlock_t *root_lock, bool validate) { int ret = NETDEV_TX_BUSY; /* And release qdisc */ spin_unlock(root_lock); - HARD_TX_LOCK(dev, txq, smp_processor_id()); - if (!netif_xmit_frozen_or_stopped(txq)) - skb = dev_hard_start_xmit(skb, dev, txq, &ret); + /* Note that we validate skb (GSO, checksum, ...) outside of locks */ + if (validate) + skb = validate_xmit_skb_list(skb, dev); - HARD_TX_UNLOCK(dev, txq); + if (skb) { + HARD_TX_LOCK(dev, txq, smp_processor_id()); + if (!netif_xmit_frozen_or_stopped(txq)) + skb = dev_hard_start_xmit(skb, dev, txq, &ret); + HARD_TX_UNLOCK(dev, txq); + } spin_lock(root_lock); if (dev_xmit_complete(ret)) { @@ -217,9 +211,10 @@ static inline int qdisc_restart(struct Qdisc *q) struct net_device *dev; spinlock_t *root_lock; struct sk_buff *skb; + bool validate; /* Dequeue packet */ - skb = dequeue_skb(q); + skb = dequeue_skb(q, &validate); if (unlikely(!skb)) return 0; @@ -229,7 +224,7 @@ static inline int qdisc_restart(struct Qdisc *q) dev = qdisc_dev(q); txq = skb_get_tx_queue(dev, skb); - return sch_direct_xmit(skb, q, dev, txq, root_lock); + return sch_direct_xmit(skb, q, dev, txq, root_lock, validate); } void __qdisc_run(struct Qdisc *q) -- cgit v1.2.3-59-g8ed1b From c7a08ac7ee68b9af0d5af99c7b34b574cac4d144 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Thu, 2 Oct 2014 12:19:42 +0300 Subject: net/mlx5_core: Update device capabilities handling Rearrange struct mlx5_caps so it has a "gen" field to represent the current capabilities configured for the device. Max capabilities can also be queried from the device. Also update capabilities struct to contain more fields as per the latest revision of the firmware specification. Signed-off-by: Eli Cohen Signed-off-by: David S. Miller
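The current-versus-max split the message describes can be sketched quickly. Below is a self-contained user-space model (not kernel code; the struct fields and values are illustrative, while HCA_CAP_OPMOD_GET_CUR and HCA_CAP_OPMOD_GET_MAX are taken from the patch): the same query runs twice with different opmods, and consumers then read configured values through the "gen" member.

#include <stdio.h>

enum { HCA_CAP_OPMOD_GET_MAX = 0, HCA_CAP_OPMOD_GET_CUR = 1 };

struct general_caps { int log_max_qp; int num_ports; };
struct caps { struct general_caps gen; struct general_caps max; };

/* Stand-in for querying firmware with a given opmod. */
static void query_caps(struct caps *c, int opmod)
{
	struct general_caps *dst =
		(opmod == HCA_CAP_OPMOD_GET_CUR) ? &c->gen : &c->max;

	/* Pretend firmware answers: current config below the maximum. */
	dst->log_max_qp = (opmod == HCA_CAP_OPMOD_GET_CUR) ? 17 : 18;
	dst->num_ports = 2;
}

int main(void)
{
	struct caps c;

	query_caps(&c, HCA_CAP_OPMOD_GET_MAX);
	query_caps(&c, HCA_CAP_OPMOD_GET_CUR);

	/* Callers now go through caps.gen for configured limits, which is
	 * exactly the dev->mdev->caps.gen.* rewrite in the diff below. */
	printf("cur log_max_qp=%d (max %d)\n",
	       c.gen.log_max_qp, c.max.log_max_qp);
	return 0;
}

Keeping "gen" as one substructure also explains the mechanical shape of the diff: every former caps.foo access becomes caps.gen.foo, with no behavior change at those call sites.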
--- drivers/infiniband/hw/mlx5/cq.c | 8 +- drivers/infiniband/hw/mlx5/mad.c | 2 +- drivers/infiniband/hw/mlx5/main.c | 83 ++++++---- drivers/infiniband/hw/mlx5/qp.c | 72 +++++--- drivers/infiniband/hw/mlx5/srq.c | 6 +- drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 24 ++- drivers/net/ethernet/mellanox/mlx5/core/eq.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/fw.c | 81 +-------- drivers/net/ethernet/mellanox/mlx5/core/main.c | 219 +++++++++++++++++++------ drivers/net/ethernet/mellanox/mlx5/core/uar.c | 4 +- include/linux/mlx5/device.h | 24 ++- include/linux/mlx5/driver.h | 28 +++- 12 files changed, 331 insertions(+), 222 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index e4056279166d..10cfce5119a9 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -752,7 +752,7 @@ struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, int entries, return ERR_PTR(-EINVAL); entries = roundup_pow_of_two(entries + 1); - if (entries > dev->mdev->caps.max_cqes) + if (entries > dev->mdev->caps.gen.max_cqes) return ERR_PTR(-EINVAL); cq = kzalloc(sizeof(*cq), GFP_KERNEL); @@ -919,7 +919,7 @@ int mlx5_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period) int err; u32 fsel; - if (!(dev->mdev->caps.flags & MLX5_DEV_CAP_FLAG_CQ_MODER)) + if (!(dev->mdev->caps.gen.flags & MLX5_DEV_CAP_FLAG_CQ_MODER)) return -ENOSYS; in = kzalloc(sizeof(*in), GFP_KERNEL); @@ -1074,7 +1074,7 @@ int mlx5_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata) int uninitialized_var(cqe_size); unsigned long flags; - if (!(dev->mdev->caps.flags & MLX5_DEV_CAP_FLAG_RESIZE_CQ)) { + if (!(dev->mdev->caps.gen.flags & MLX5_DEV_CAP_FLAG_RESIZE_CQ)) { pr_info("Firmware does not support resize CQ\n"); return -ENOSYS; } @@ -1083,7 +1083,7 @@ int mlx5_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata) return -EINVAL; entries = roundup_pow_of_two(entries + 1); - if (entries > dev->mdev->caps.max_cqes + 1) + if (entries > dev->mdev->caps.gen.max_cqes + 1) return -EINVAL; if (entries == ibcq->cqe + 1) diff --git a/drivers/infiniband/hw/mlx5/mad.c b/drivers/infiniband/hw/mlx5/mad.c index b514bbb5610f..657af9a1167c 100644 --- a/drivers/infiniband/hw/mlx5/mad.c +++ b/drivers/infiniband/hw/mlx5/mad.c @@ -129,7 +129,7 @@ int mlx5_query_ext_port_caps(struct mlx5_ib_dev *dev, u8 port) packet_error = be16_to_cpu(out_mad->status); - dev->mdev->caps.ext_port_cap[port - 1] = (!err && !packet_error) ? + dev->mdev->caps.gen.ext_port_cap[port - 1] = (!err && !packet_error) ?
MLX_EXT_PORT_CAP_FLAG_EXTENDED_PORT_INFO : 0; out: diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index d8907b20522a..f3114d1132fb 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -157,11 +157,13 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, struct mlx5_ib_dev *dev = to_mdev(ibdev); struct ib_smp *in_mad = NULL; struct ib_smp *out_mad = NULL; + struct mlx5_general_caps *gen; int err = -ENOMEM; int max_rq_sg; int max_sq_sg; u64 flags; + gen = &dev->mdev->caps.gen; in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL); out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL); if (!in_mad || !out_mad) @@ -183,7 +185,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_RC_RNR_NAK_GEN; - flags = dev->mdev->caps.flags; + flags = gen->flags; if (flags & MLX5_DEV_CAP_FLAG_BAD_PKEY_CNTR) props->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR; if (flags & MLX5_DEV_CAP_FLAG_BAD_QKEY_CNTR) @@ -213,30 +215,31 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, memcpy(&props->sys_image_guid, out_mad->data + 4, 8); props->max_mr_size = ~0ull; - props->page_size_cap = dev->mdev->caps.min_page_sz; - props->max_qp = 1 << dev->mdev->caps.log_max_qp; - props->max_qp_wr = dev->mdev->caps.max_wqes; - max_rq_sg = dev->mdev->caps.max_rq_desc_sz / sizeof(struct mlx5_wqe_data_seg); - max_sq_sg = (dev->mdev->caps.max_sq_desc_sz - sizeof(struct mlx5_wqe_ctrl_seg)) / + props->page_size_cap = gen->min_page_sz; + props->max_qp = 1 << gen->log_max_qp; + props->max_qp_wr = gen->max_wqes; + max_rq_sg = gen->max_rq_desc_sz / sizeof(struct mlx5_wqe_data_seg); + max_sq_sg = (gen->max_sq_desc_sz - sizeof(struct mlx5_wqe_ctrl_seg)) / sizeof(struct mlx5_wqe_data_seg); props->max_sge = min(max_rq_sg, max_sq_sg); - props->max_cq = 1 << dev->mdev->caps.log_max_cq; - props->max_cqe = dev->mdev->caps.max_cqes - 1; - props->max_mr = 1 << dev->mdev->caps.log_max_mkey; - props->max_pd = 1 << dev->mdev->caps.log_max_pd; - props->max_qp_rd_atom = dev->mdev->caps.max_ra_req_qp; - props->max_qp_init_rd_atom = dev->mdev->caps.max_ra_res_qp; + props->max_cq = 1 << gen->log_max_cq; + props->max_cqe = gen->max_cqes - 1; + props->max_mr = 1 << gen->log_max_mkey; + props->max_pd = 1 << gen->log_max_pd; + props->max_qp_rd_atom = 1 << gen->log_max_ra_req_qp; + props->max_qp_init_rd_atom = 1 << gen->log_max_ra_res_qp; + props->max_srq = 1 << gen->log_max_srq; + props->max_srq_wr = gen->max_srq_wqes - 1; + props->local_ca_ack_delay = gen->local_ca_ack_delay; props->max_res_rd_atom = props->max_qp_rd_atom * props->max_qp; - props->max_srq = 1 << dev->mdev->caps.log_max_srq; - props->max_srq_wr = dev->mdev->caps.max_srq_wqes - 1; props->max_srq_sge = max_rq_sg - 1; props->max_fast_reg_page_list_len = (unsigned int)-1; - props->local_ca_ack_delay = dev->mdev->caps.local_ca_ack_delay; + props->local_ca_ack_delay = gen->local_ca_ack_delay; props->atomic_cap = IB_ATOMIC_NONE; props->masked_atomic_cap = IB_ATOMIC_NONE; props->max_pkeys = be16_to_cpup((__be16 *)(out_mad->data + 28)); - props->max_mcast_grp = 1 << dev->mdev->caps.log_max_mcg; - props->max_mcast_qp_attach = dev->mdev->caps.max_qp_mcg; + props->max_mcast_grp = 1 << gen->log_max_mcg; + props->max_mcast_qp_attach = gen->max_qp_mcg; props->max_total_mcast_qp_attach = props->max_mcast_qp_attach * props->max_mcast_grp; props->max_map_per_fmr = INT_MAX; /* no limit in ConnectIB */ @@ -254,10 +257,12 @@ int mlx5_ib_query_port(struct ib_device *ibdev, 
u8 port, struct mlx5_ib_dev *dev = to_mdev(ibdev); struct ib_smp *in_mad = NULL; struct ib_smp *out_mad = NULL; + struct mlx5_general_caps *gen; int ext_active_speed; int err = -ENOMEM; - if (port < 1 || port > dev->mdev->caps.num_ports) { + gen = &dev->mdev->caps.gen; + if (port < 1 || port > gen->num_ports) { mlx5_ib_warn(dev, "invalid port number %d\n", port); return -EINVAL; } @@ -288,8 +293,8 @@ int mlx5_ib_query_port(struct ib_device *ibdev, u8 port, props->phys_state = out_mad->data[33] >> 4; props->port_cap_flags = be32_to_cpup((__be32 *)(out_mad->data + 20)); props->gid_tbl_len = out_mad->data[50]; - props->max_msg_sz = 1 << to_mdev(ibdev)->mdev->caps.log_max_msg; - props->pkey_tbl_len = to_mdev(ibdev)->mdev->caps.port[port - 1].pkey_table_len; + props->max_msg_sz = 1 << gen->log_max_msg; + props->pkey_tbl_len = gen->port[port - 1].pkey_table_len; props->bad_pkey_cntr = be16_to_cpup((__be16 *)(out_mad->data + 46)); props->qkey_viol_cntr = be16_to_cpup((__be16 *)(out_mad->data + 48)); props->active_width = out_mad->data[31] & 0xf; @@ -316,7 +321,7 @@ int mlx5_ib_query_port(struct ib_device *ibdev, u8 port, /* If reported active speed is QDR, check if is FDR-10 */ if (props->active_speed == 4) { - if (dev->mdev->caps.ext_port_cap[port - 1] & + if (gen->ext_port_cap[port - 1] & MLX_EXT_PORT_CAP_FLAG_EXTENDED_PORT_INFO) { init_query_mad(in_mad); in_mad->attr_id = MLX5_ATTR_EXTENDED_PORT_INFO; @@ -470,6 +475,7 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, struct mlx5_ib_alloc_ucontext_req_v2 req; struct mlx5_ib_alloc_ucontext_resp resp; struct mlx5_ib_ucontext *context; + struct mlx5_general_caps *gen; struct mlx5_uuar_info *uuari; struct mlx5_uar *uars; int gross_uuars; @@ -480,6 +486,7 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, int i; size_t reqlen; + gen = &dev->mdev->caps.gen; if (!dev->ib_active) return ERR_PTR(-EAGAIN); @@ -512,14 +519,14 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, num_uars = req.total_num_uuars / MLX5_NON_FP_BF_REGS_PER_PAGE; gross_uuars = num_uars * MLX5_BF_REGS_PER_PAGE; - resp.qp_tab_size = 1 << dev->mdev->caps.log_max_qp; - resp.bf_reg_size = dev->mdev->caps.bf_reg_size; + resp.qp_tab_size = 1 << gen->log_max_qp; + resp.bf_reg_size = gen->bf_reg_size; resp.cache_line_size = L1_CACHE_BYTES; - resp.max_sq_desc_sz = dev->mdev->caps.max_sq_desc_sz; - resp.max_rq_desc_sz = dev->mdev->caps.max_rq_desc_sz; - resp.max_send_wqebb = dev->mdev->caps.max_wqes; - resp.max_recv_wr = dev->mdev->caps.max_wqes; - resp.max_srq_recv_wr = dev->mdev->caps.max_srq_wqes; + resp.max_sq_desc_sz = gen->max_sq_desc_sz; + resp.max_rq_desc_sz = gen->max_rq_desc_sz; + resp.max_send_wqebb = gen->max_wqes; + resp.max_recv_wr = gen->max_wqes; + resp.max_srq_recv_wr = gen->max_srq_wqes; context = kzalloc(sizeof(*context), GFP_KERNEL); if (!context) @@ -565,7 +572,7 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, mutex_init(&context->db_page_mutex); resp.tot_uuars = req.total_num_uuars; - resp.num_ports = dev->mdev->caps.num_ports; + resp.num_ports = gen->num_ports; err = ib_copy_to_udata(udata, &resp, sizeof(resp) - sizeof(resp.reserved)); if (err) @@ -967,9 +974,11 @@ static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context, static void get_ext_port_caps(struct mlx5_ib_dev *dev) { + struct mlx5_general_caps *gen; int port; - for (port = 1; port <= dev->mdev->caps.num_ports; port++) + gen = &dev->mdev->caps.gen; + for (port = 1; port <= gen->num_ports; 
port++) mlx5_query_ext_port_caps(dev, port); } @@ -977,9 +986,11 @@ static int get_port_caps(struct mlx5_ib_dev *dev) { struct ib_device_attr *dprops = NULL; struct ib_port_attr *pprops = NULL; + struct mlx5_general_caps *gen; int err = 0; int port; + gen = &dev->mdev->caps.gen; pprops = kmalloc(sizeof(*pprops), GFP_KERNEL); if (!pprops) goto out; @@ -994,14 +1005,14 @@ static int get_port_caps(struct mlx5_ib_dev *dev) goto out; } - for (port = 1; port <= dev->mdev->caps.num_ports; port++) { + for (port = 1; port <= gen->num_ports; port++) { err = mlx5_ib_query_port(&dev->ib_dev, port, pprops); if (err) { mlx5_ib_warn(dev, "query_port %d failed %d\n", port, err); break; } - dev->mdev->caps.port[port - 1].pkey_table_len = dprops->max_pkeys; - dev->mdev->caps.port[port - 1].gid_table_len = pprops->gid_tbl_len; + gen->port[port - 1].pkey_table_len = dprops->max_pkeys; + gen->port[port - 1].gid_table_len = pprops->gid_tbl_len; mlx5_ib_dbg(dev, "pkey_table_len %d, gid_table_len %d\n", dprops->max_pkeys, pprops->gid_tbl_len); } @@ -1279,8 +1290,8 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) strlcpy(dev->ib_dev.name, "mlx5_%d", IB_DEVICE_NAME_MAX); dev->ib_dev.owner = THIS_MODULE; dev->ib_dev.node_type = RDMA_NODE_IB_CA; - dev->ib_dev.local_dma_lkey = mdev->caps.reserved_lkey; - dev->num_ports = mdev->caps.num_ports; + dev->ib_dev.local_dma_lkey = mdev->caps.gen.reserved_lkey; + dev->num_ports = mdev->caps.gen.num_ports; dev->ib_dev.phys_port_cnt = dev->num_ports; dev->ib_dev.num_comp_vectors = dev->num_comp_vectors; dev->ib_dev.dma_device = &mdev->pdev->dev; @@ -1355,7 +1366,7 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) dev->ib_dev.free_fast_reg_page_list = mlx5_ib_free_fast_reg_page_list; dev->ib_dev.check_mr_status = mlx5_ib_check_mr_status; - if (mdev->caps.flags & MLX5_DEV_CAP_FLAG_XRC) { + if (mdev->caps.gen.flags & MLX5_DEV_CAP_FLAG_XRC) { dev->ib_dev.alloc_xrcd = mlx5_ib_alloc_xrcd; dev->ib_dev.dealloc_xrcd = mlx5_ib_dealloc_xrcd; dev->ib_dev.uverbs_cmd_mask |= diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 8c574b63d77b..dbfe498870c1 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -158,11 +158,13 @@ static void mlx5_ib_qp_event(struct mlx5_core_qp *qp, int type) static int set_rq_size(struct mlx5_ib_dev *dev, struct ib_qp_cap *cap, int has_rq, struct mlx5_ib_qp *qp, struct mlx5_ib_create_qp *ucmd) { + struct mlx5_general_caps *gen; int wqe_size; int wq_size; + gen = &dev->mdev->caps.gen; /* Sanity check RQ size before proceeding */ - if (cap->max_recv_wr > dev->mdev->caps.max_wqes) + if (cap->max_recv_wr > gen->max_wqes) return -EINVAL; if (!has_rq) { @@ -182,10 +184,10 @@ static int set_rq_size(struct mlx5_ib_dev *dev, struct ib_qp_cap *cap, wq_size = roundup_pow_of_two(cap->max_recv_wr) * wqe_size; wq_size = max_t(int, wq_size, MLX5_SEND_WQE_BB); qp->rq.wqe_cnt = wq_size / wqe_size; - if (wqe_size > dev->mdev->caps.max_rq_desc_sz) { + if (wqe_size > gen->max_rq_desc_sz) { mlx5_ib_dbg(dev, "wqe_size %d, max %d\n", wqe_size, - dev->mdev->caps.max_rq_desc_sz); + gen->max_rq_desc_sz); return -EINVAL; } qp->rq.wqe_shift = ilog2(wqe_size); @@ -266,9 +268,11 @@ static int calc_send_wqe(struct ib_qp_init_attr *attr) static int calc_sq_size(struct mlx5_ib_dev *dev, struct ib_qp_init_attr *attr, struct mlx5_ib_qp *qp) { + struct mlx5_general_caps *gen; int wqe_size; int wq_size; + gen = &dev->mdev->caps.gen; if (!attr->cap.max_send_wr) return 0; @@ -277,9 +281,9 @@ static int calc_sq_size(struct 
mlx5_ib_dev *dev, struct ib_qp_init_attr *attr, if (wqe_size < 0) return wqe_size; - if (wqe_size > dev->mdev->caps.max_sq_desc_sz) { + if (wqe_size > gen->max_sq_desc_sz) { mlx5_ib_dbg(dev, "wqe_size(%d) > max_sq_desc_sz(%d)\n", - wqe_size, dev->mdev->caps.max_sq_desc_sz); + wqe_size, gen->max_sq_desc_sz); return -EINVAL; } @@ -292,9 +296,9 @@ static int calc_sq_size(struct mlx5_ib_dev *dev, struct ib_qp_init_attr *attr, wq_size = roundup_pow_of_two(attr->cap.max_send_wr * wqe_size); qp->sq.wqe_cnt = wq_size / MLX5_SEND_WQE_BB; - if (qp->sq.wqe_cnt > dev->mdev->caps.max_wqes) { + if (qp->sq.wqe_cnt > gen->max_wqes) { mlx5_ib_dbg(dev, "wqe count(%d) exceeds limits(%d)\n", - qp->sq.wqe_cnt, dev->mdev->caps.max_wqes); + qp->sq.wqe_cnt, gen->max_wqes); return -ENOMEM; } qp->sq.wqe_shift = ilog2(MLX5_SEND_WQE_BB); @@ -309,11 +313,13 @@ static int set_user_buf_size(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, struct mlx5_ib_create_qp *ucmd) { + struct mlx5_general_caps *gen; int desc_sz = 1 << qp->sq.wqe_shift; - if (desc_sz > dev->mdev->caps.max_sq_desc_sz) { + gen = &dev->mdev->caps.gen; + if (desc_sz > gen->max_sq_desc_sz) { mlx5_ib_warn(dev, "desc_sz %d, max_sq_desc_sz %d\n", - desc_sz, dev->mdev->caps.max_sq_desc_sz); + desc_sz, gen->max_sq_desc_sz); return -EINVAL; } @@ -325,9 +331,9 @@ static int set_user_buf_size(struct mlx5_ib_dev *dev, qp->sq.wqe_cnt = ucmd->sq_wqe_count; - if (qp->sq.wqe_cnt > dev->mdev->caps.max_wqes) { + if (qp->sq.wqe_cnt > gen->max_wqes) { mlx5_ib_warn(dev, "wqe_cnt %d, max_wqes %d\n", - qp->sq.wqe_cnt, dev->mdev->caps.max_wqes); + qp->sq.wqe_cnt, gen->max_wqes); return -EINVAL; } @@ -803,16 +809,18 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, struct mlx5_ib_resources *devr = &dev->devr; struct mlx5_ib_create_qp_resp resp; struct mlx5_create_qp_mbox_in *in; + struct mlx5_general_caps *gen; struct mlx5_ib_create_qp ucmd; int inlen = sizeof(*in); int err; + gen = &dev->mdev->caps.gen; mutex_init(&qp->mutex); spin_lock_init(&qp->sq.lock); spin_lock_init(&qp->rq.lock); if (init_attr->create_flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) { - if (!(dev->mdev->caps.flags & MLX5_DEV_CAP_FLAG_BLOCK_MCAST)) { + if (!(gen->flags & MLX5_DEV_CAP_FLAG_BLOCK_MCAST)) { mlx5_ib_dbg(dev, "block multicast loopback isn't supported\n"); return -EINVAL; } else { @@ -851,9 +859,9 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, mlx5_ib_dbg(dev, "invalid rq params\n"); return -EINVAL; } - if (ucmd.sq_wqe_count > dev->mdev->caps.max_wqes) { + if (ucmd.sq_wqe_count > gen->max_wqes) { mlx5_ib_dbg(dev, "requested sq_wqe_count (%d) > max allowed (%d)\n", - ucmd.sq_wqe_count, dev->mdev->caps.max_wqes); + ucmd.sq_wqe_count, gen->max_wqes); return -EINVAL; } err = create_user_qp(dev, pd, qp, udata, &in, &resp, &inlen); @@ -1144,6 +1152,7 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *init_attr, struct ib_udata *udata) { + struct mlx5_general_caps *gen; struct mlx5_ib_dev *dev; struct mlx5_ib_qp *qp; u16 xrcdn = 0; @@ -1161,11 +1170,12 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, } dev = to_mdev(to_mxrcd(init_attr->xrcd)->ibxrcd.device); } + gen = &dev->mdev->caps.gen; switch (init_attr->qp_type) { case IB_QPT_XRC_TGT: case IB_QPT_XRC_INI: - if (!(dev->mdev->caps.flags & MLX5_DEV_CAP_FLAG_XRC)) { + if (!(gen->flags & MLX5_DEV_CAP_FLAG_XRC)) { mlx5_ib_dbg(dev, "XRC not supported\n"); return ERR_PTR(-ENOSYS); } @@ -1272,6 +1282,9 @@ enum { static int ib_rate_to_mlx5(struct mlx5_ib_dev *dev, u8 rate) { + 
struct mlx5_general_caps *gen; + + gen = &dev->mdev->caps.gen; if (rate == IB_RATE_PORT_CURRENT) { return 0; } else if (rate < IB_RATE_2_5_GBPS || rate > IB_RATE_300_GBPS) { @@ -1279,7 +1292,7 @@ static int ib_rate_to_mlx5(struct mlx5_ib_dev *dev, u8 rate) } else { while (rate != IB_RATE_2_5_GBPS && !(1 << (rate + MLX5_STAT_RATE_OFFSET) & - dev->mdev->caps.stat_rate_support)) + gen->stat_rate_support)) --rate; } @@ -1290,8 +1303,10 @@ static int mlx5_set_path(struct mlx5_ib_dev *dev, const struct ib_ah_attr *ah, struct mlx5_qp_path *path, u8 port, int attr_mask, u32 path_flags, const struct ib_qp_attr *attr) { + struct mlx5_general_caps *gen; int err; + gen = &dev->mdev->caps.gen; path->fl = (path_flags & MLX5_PATH_FLAG_FL) ? 0x80 : 0; path->free_ar = (path_flags & MLX5_PATH_FLAG_FREE_AR) ? 0x80 : 0; @@ -1318,9 +1333,9 @@ static int mlx5_set_path(struct mlx5_ib_dev *dev, const struct ib_ah_attr *ah, path->port = port; if (ah->ah_flags & IB_AH_GRH) { - if (ah->grh.sgid_index >= dev->mdev->caps.port[port - 1].gid_table_len) { + if (ah->grh.sgid_index >= gen->port[port - 1].gid_table_len) { pr_err(KERN_ERR "sgid_index (%u) too large. max is %d\n", - ah->grh.sgid_index, dev->mdev->caps.port[port - 1].gid_table_len); + ah->grh.sgid_index, gen->port[port - 1].gid_table_len); return -EINVAL; } @@ -1492,6 +1507,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, struct mlx5_ib_qp *qp = to_mqp(ibqp); struct mlx5_ib_cq *send_cq, *recv_cq; struct mlx5_qp_context *context; + struct mlx5_general_caps *gen; struct mlx5_modify_qp_mbox_in *in; struct mlx5_ib_pd *pd; enum mlx5_qp_state mlx5_cur, mlx5_new; @@ -1500,6 +1516,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, int mlx5_st; int err; + gen = &dev->mdev->caps.gen; in = kzalloc(sizeof(*in), GFP_KERNEL); if (!in) return -ENOMEM; @@ -1539,7 +1556,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, err = -EINVAL; goto out; } - context->mtu_msgmax = (attr->path_mtu << 5) | dev->mdev->caps.log_max_msg; + context->mtu_msgmax = (attr->path_mtu << 5) | gen->log_max_msg; } if (attr_mask & IB_QP_DEST_QPN) @@ -1685,9 +1702,11 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, struct mlx5_ib_dev *dev = to_mdev(ibqp->device); struct mlx5_ib_qp *qp = to_mqp(ibqp); enum ib_qp_state cur_state, new_state; + struct mlx5_general_caps *gen; int err = -EINVAL; int port; + gen = &dev->mdev->caps.gen; mutex_lock(&qp->mutex); cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state; @@ -1699,21 +1718,21 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, goto out; if ((attr_mask & IB_QP_PORT) && - (attr->port_num == 0 || attr->port_num > dev->mdev->caps.num_ports)) + (attr->port_num == 0 || attr->port_num > gen->num_ports)) goto out; if (attr_mask & IB_QP_PKEY_INDEX) { port = attr_mask & IB_QP_PORT ? 
attr->port_num : qp->port; - if (attr->pkey_index >= dev->mdev->caps.port[port - 1].pkey_table_len) + if (attr->pkey_index >= gen->port[port - 1].pkey_table_len) goto out; } if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC && - attr->max_rd_atomic > dev->mdev->caps.max_ra_res_qp) + attr->max_rd_atomic > (1 << gen->log_max_ra_res_qp)) goto out; if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC && - attr->max_dest_rd_atomic > dev->mdev->caps.max_ra_req_qp) + attr->max_dest_rd_atomic > (1 << gen->log_max_ra_req_qp)) goto out; if (cur_state == new_state && cur_state == IB_QPS_RESET) { @@ -2893,7 +2912,8 @@ static void to_ib_ah_attr(struct mlx5_ib_dev *ibdev, struct ib_ah_attr *ib_ah_at memset(ib_ah_attr, 0, sizeof(*ib_ah_attr)); ib_ah_attr->port_num = path->port; - if (ib_ah_attr->port_num == 0 || ib_ah_attr->port_num > dev->caps.num_ports) + if (ib_ah_attr->port_num == 0 || + ib_ah_attr->port_num > dev->caps.gen.num_ports) return; ib_ah_attr->sl = path->sl & 0xf; @@ -3011,10 +3031,12 @@ struct ib_xrcd *mlx5_ib_alloc_xrcd(struct ib_device *ibdev, struct ib_udata *udata) { struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_general_caps *gen; struct mlx5_ib_xrcd *xrcd; int err; - if (!(dev->mdev->caps.flags & MLX5_DEV_CAP_FLAG_XRC)) + gen = &dev->mdev->caps.gen; + if (!(gen->flags & MLX5_DEV_CAP_FLAG_XRC)) return ERR_PTR(-ENOSYS); xrcd = kmalloc(sizeof(*xrcd), GFP_KERNEL); diff --git a/drivers/infiniband/hw/mlx5/srq.c b/drivers/infiniband/hw/mlx5/srq.c index 70bd131ba646..97cc1baaa8e3 100644 --- a/drivers/infiniband/hw/mlx5/srq.c +++ b/drivers/infiniband/hw/mlx5/srq.c @@ -238,6 +238,7 @@ struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd, struct ib_udata *udata) { struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_general_caps *gen; struct mlx5_ib_srq *srq; int desc_size; int buf_size; @@ -247,11 +248,12 @@ struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd, int is_xrc; u32 flgs, xrcdn; + gen = &dev->mdev->caps.gen; /* Sanity check SRQ size before proceeding */ - if (init_attr->attr.max_wr >= dev->mdev->caps.max_srq_wqes) { + if (init_attr->attr.max_wr >= gen->max_srq_wqes) { mlx5_ib_dbg(dev, "max_wr %d, cap %d\n", init_attr->attr.max_wr, - dev->mdev->caps.max_srq_wqes); + gen->max_srq_wqes); return ERR_PTR(-EINVAL); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c index 65a7da69e2ac..6eb0f85cf872 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -1538,16 +1538,9 @@ static const char *cmd_status_str(u8 status) } } -int mlx5_cmd_status_to_err(struct mlx5_outbox_hdr *hdr) +static int cmd_status_to_err(u8 status) { - if (!hdr->status) - return 0; - - pr_warn("command failed, status %s(0x%x), syndrome 0x%x\n", - cmd_status_str(hdr->status), hdr->status, - be32_to_cpu(hdr->syndrome)); - - switch (hdr->status) { + switch (status) { case MLX5_CMD_STAT_OK: return 0; case MLX5_CMD_STAT_INT_ERR: return -EIO; case MLX5_CMD_STAT_BAD_OP_ERR: return -EINVAL; @@ -1567,3 +1560,16 @@ int mlx5_cmd_status_to_err(struct mlx5_outbox_hdr *hdr) default: return -EIO; } } + +/* this will be available till all the commands use set/get macros */ +int mlx5_cmd_status_to_err(struct mlx5_outbox_hdr *hdr) +{ + if (!hdr->status) + return 0; + + pr_warn("command failed, status %s(0x%x), syndrome 0x%x\n", + cmd_status_str(hdr->status), hdr->status, + be32_to_cpu(hdr->syndrome)); + + return cmd_status_to_err(hdr->status); +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c 
b/drivers/net/ethernet/mellanox/mlx5/core/eq.c index 4e8bd0b34bb0..11b9b840ad4d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c @@ -468,7 +468,7 @@ int mlx5_start_eqs(struct mlx5_core_dev *dev) err = mlx5_create_map_eq(dev, &table->pages_eq, MLX5_EQ_VEC_PAGES, - dev->caps.max_vf + 1, + dev->caps.gen.max_vf + 1, 1 << MLX5_EVENT_TYPE_PAGE_REQUEST, "mlx5_pages_eq", &dev->priv.uuari.uars[0]); if (err) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw.c b/drivers/net/ethernet/mellanox/mlx5/core/fw.c index f012658b6a92..087c4c797deb 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fw.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fw.c @@ -64,86 +64,9 @@ out_out: return err; } -int mlx5_cmd_query_hca_cap(struct mlx5_core_dev *dev, - struct mlx5_caps *caps) +int mlx5_cmd_query_hca_cap(struct mlx5_core_dev *dev, struct mlx5_caps *caps) { - struct mlx5_cmd_query_hca_cap_mbox_out *out; - struct mlx5_cmd_query_hca_cap_mbox_in in; - struct mlx5_query_special_ctxs_mbox_out ctx_out; - struct mlx5_query_special_ctxs_mbox_in ctx_in; - int err; - u16 t16; - - out = kzalloc(sizeof(*out), GFP_KERNEL); - if (!out) - return -ENOMEM; - - memset(&in, 0, sizeof(in)); - in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_QUERY_HCA_CAP); - in.hdr.opmod = cpu_to_be16(0x1); - err = mlx5_cmd_exec(dev, &in, sizeof(in), out, sizeof(*out)); - if (err) - goto out_out; - - if (out->hdr.status) { - err = mlx5_cmd_status_to_err(&out->hdr); - goto out_out; - } - - - caps->log_max_eq = out->hca_cap.log_max_eq & 0xf; - caps->max_cqes = 1 << out->hca_cap.log_max_cq_sz; - caps->max_wqes = 1 << out->hca_cap.log_max_qp_sz; - caps->max_sq_desc_sz = be16_to_cpu(out->hca_cap.max_desc_sz_sq); - caps->max_rq_desc_sz = be16_to_cpu(out->hca_cap.max_desc_sz_rq); - caps->flags = be64_to_cpu(out->hca_cap.flags); - caps->stat_rate_support = be16_to_cpu(out->hca_cap.stat_rate_support); - caps->log_max_msg = out->hca_cap.log_max_msg & 0x1f; - caps->num_ports = out->hca_cap.num_ports & 0xf; - caps->log_max_cq = out->hca_cap.log_max_cq & 0x1f; - if (caps->num_ports > MLX5_MAX_PORTS) { - mlx5_core_err(dev, "device has %d ports while the driver supports max %d ports\n", - caps->num_ports, MLX5_MAX_PORTS); - err = -EINVAL; - goto out_out; - } - caps->log_max_qp = out->hca_cap.log_max_qp & 0x1f; - caps->log_max_mkey = out->hca_cap.log_max_mkey & 0x3f; - caps->log_max_pd = out->hca_cap.log_max_pd & 0x1f; - caps->log_max_srq = out->hca_cap.log_max_srqs & 0x1f; - caps->local_ca_ack_delay = out->hca_cap.local_ca_ack_delay & 0x1f; - caps->log_max_mcg = out->hca_cap.log_max_mcg; - caps->max_qp_mcg = be32_to_cpu(out->hca_cap.max_qp_mcg) & 0xffffff; - caps->max_ra_res_qp = 1 << (out->hca_cap.log_max_ra_res_qp & 0x3f); - caps->max_ra_req_qp = 1 << (out->hca_cap.log_max_ra_req_qp & 0x3f); - caps->max_srq_wqes = 1 << out->hca_cap.log_max_srq_sz; - t16 = be16_to_cpu(out->hca_cap.bf_log_bf_reg_size); - if (t16 & 0x8000) { - caps->bf_reg_size = 1 << (t16 & 0x1f); - caps->bf_regs_per_page = MLX5_BF_REGS_PER_PAGE; - } else { - caps->bf_reg_size = 0; - caps->bf_regs_per_page = 0; - } - caps->min_page_sz = ~(u32)((1 << out->hca_cap.log_pg_sz) - 1); - - memset(&ctx_in, 0, sizeof(ctx_in)); - memset(&ctx_out, 0, sizeof(ctx_out)); - ctx_in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_QUERY_SPECIAL_CONTEXTS); - err = mlx5_cmd_exec(dev, &ctx_in, sizeof(ctx_in), - &ctx_out, sizeof(ctx_out)); - if (err) - goto out_out; - - if (ctx_out.hdr.status) - err = mlx5_cmd_status_to_err(&ctx_out.hdr); - - caps->reserved_lkey = 
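/*
 * Editorial sketch, not part of the patch: after this change the old
 * open-coded capability parser in fw.c reduces to a one-line wrapper,
 *
 *	int mlx5_cmd_query_hca_cap(struct mlx5_core_dev *dev,
 *				   struct mlx5_caps *caps)
 *	{
 *		return mlx5_core_get_caps(dev, caps, HCA_CAP_OPMOD_GET_CUR);
 *	}
 *
 * so field extraction lives in one place (fw2drv_caps() in main.c)
 * and the "max" and "current" query paths share it.
 */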
be32_to_cpu(ctx_out.reserved_lkey); - -out_out: - kfree(out); - - return err; + return mlx5_core_get_caps(dev, caps, HCA_CAP_OPMOD_GET_CUR); } int mlx5_cmd_init_hca(struct mlx5_core_dev *dev) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index f2716cc1f51d..d9f74618befa 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -207,11 +207,11 @@ static void release_bar(struct pci_dev *pdev) static int mlx5_enable_msix(struct mlx5_core_dev *dev) { struct mlx5_eq_table *table = &dev->priv.eq_table; - int num_eqs = 1 << dev->caps.log_max_eq; + int num_eqs = 1 << dev->caps.gen.log_max_eq; int nvec; int i; - nvec = dev->caps.num_ports * num_online_cpus() + MLX5_EQ_VEC_COMP_BASE; + nvec = dev->caps.gen.num_ports * num_online_cpus() + MLX5_EQ_VEC_COMP_BASE; nvec = min_t(int, nvec, num_eqs); if (nvec <= MLX5_EQ_VEC_COMP_BASE) return -ENOMEM; @@ -250,13 +250,34 @@ struct mlx5_reg_host_endianess { #define CAP_MASK(pos, size) ((u64)((1 << (size)) - 1) << (pos)) enum { - MLX5_CAP_BITS_RW_MASK = CAP_MASK(MLX5_CAP_OFF_CMDIF_CSUM, 2) | - CAP_MASK(MLX5_CAP_OFF_DCT, 1), + MLX5_CAP_BITS_RW_MASK = CAP_MASK(MLX5_CAP_OFF_CMDIF_CSUM, 2) | + MLX5_DEV_CAP_FLAG_DCT, }; +static u16 to_fw_pkey_sz(u32 size) +{ + switch (size) { + case 128: + return 0; + case 256: + return 1; + case 512: + return 2; + case 1024: + return 3; + case 2048: + return 4; + case 4096: + return 5; + default: + pr_warn("invalid pkey table size %d\n", size); + return 0; + } +} + /* selectively copy writable fields clearing any reserved area */ -static void copy_rw_fields(struct mlx5_hca_cap *to, struct mlx5_hca_cap *from) +static void copy_rw_fields(struct mlx5_hca_cap *to, struct mlx5_general_caps *from) { u64 v64; @@ -265,76 +286,172 @@ static void copy_rw_fields(struct mlx5_hca_cap *to, struct mlx5_hca_cap *from) to->log_max_ra_res_dc = from->log_max_ra_res_dc & 0x3f; to->log_max_ra_req_qp = from->log_max_ra_req_qp & 0x3f; to->log_max_ra_res_qp = from->log_max_ra_res_qp & 0x3f; - to->log_max_atomic_size_qp = from->log_max_atomic_size_qp; - to->log_max_atomic_size_dc = from->log_max_atomic_size_dc; - v64 = be64_to_cpu(from->flags) & MLX5_CAP_BITS_RW_MASK; + to->pkey_table_size = cpu_to_be16(to_fw_pkey_sz(from->pkey_table_size)); + v64 = from->flags & MLX5_CAP_BITS_RW_MASK; to->flags = cpu_to_be64(v64); } -enum { - HCA_CAP_OPMOD_GET_MAX = 0, - HCA_CAP_OPMOD_GET_CUR = 1, -}; +static u16 get_pkey_table_size(int pkey) +{ + if (pkey > MLX5_MAX_LOG_PKEY_TABLE) + return 0; -static int handle_hca_cap(struct mlx5_core_dev *dev) + return MLX5_MIN_PKEY_TABLE_SIZE << pkey; +} + +static void fw2drv_caps(struct mlx5_caps *caps, + struct mlx5_cmd_query_hca_cap_mbox_out *out) { - struct mlx5_cmd_query_hca_cap_mbox_out *query_out = NULL; - struct mlx5_cmd_set_hca_cap_mbox_in *set_ctx = NULL; - struct mlx5_cmd_query_hca_cap_mbox_in query_ctx; - struct mlx5_cmd_set_hca_cap_mbox_out set_out; - u64 flags; + struct mlx5_general_caps *gen = &caps->gen; + u16 t16; + + gen->max_srq_wqes = 1 << out->hca_cap.log_max_srq_sz; + gen->max_wqes = 1 << out->hca_cap.log_max_qp_sz; + gen->log_max_qp = out->hca_cap.log_max_qp & 0x1f; + gen->log_max_strq = out->hca_cap.log_max_strq_sz; + gen->log_max_srq = out->hca_cap.log_max_srqs & 0x1f; + gen->max_cqes = 1 << out->hca_cap.log_max_cq_sz; + gen->log_max_cq = out->hca_cap.log_max_cq & 0x1f; + gen->max_eqes = out->hca_cap.log_max_eq_sz; + gen->log_max_mkey = out->hca_cap.log_max_mkey & 0x3f; + gen->log_max_eq = 
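/*
 * Editorial sketch, not part of the patch: the two pkey helpers above
 * are inverses around a log2 encoding with a 128-entry base,
 * entries = MLX5_MIN_PKEY_TABLE_SIZE << fw_value:
 *
 *	get_pkey_table_size(2);		returns 128 << 2 = 512 entries
 *	to_fw_pkey_sz(512);		returns 2
 *
 * Indices above MLX5_MAX_LOG_PKEY_TABLE (5, i.e. 4096 entries) and
 * sizes that are not one of the six encodable values both fall back
 * to 0, the latter with a warning from to_fw_pkey_sz().
 */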
out->hca_cap.log_max_eq & 0xf; + gen->max_indirection = out->hca_cap.max_indirection; + gen->log_max_mrw_sz = out->hca_cap.log_max_mrw_sz; + gen->log_max_bsf_list_size = 0; + gen->log_max_klm_list_size = 0; + gen->log_max_ra_req_dc = out->hca_cap.log_max_ra_req_dc; + gen->log_max_ra_res_dc = out->hca_cap.log_max_ra_res_dc; + gen->log_max_ra_req_qp = out->hca_cap.log_max_ra_req_qp; + gen->log_max_ra_res_qp = out->hca_cap.log_max_ra_res_qp; + gen->max_qp_counters = be16_to_cpu(out->hca_cap.max_qp_count); + gen->pkey_table_size = get_pkey_table_size(be16_to_cpu(out->hca_cap.pkey_table_size)); + gen->local_ca_ack_delay = out->hca_cap.local_ca_ack_delay & 0x1f; + gen->num_ports = out->hca_cap.num_ports & 0xf; + gen->log_max_msg = out->hca_cap.log_max_msg & 0x1f; + gen->stat_rate_support = be16_to_cpu(out->hca_cap.stat_rate_support); + gen->flags = be64_to_cpu(out->hca_cap.flags); + pr_debug("flags = 0x%llx\n", gen->flags); + gen->uar_sz = out->hca_cap.uar_sz; + gen->min_log_pg_sz = out->hca_cap.log_pg_sz; + + t16 = be16_to_cpu(out->hca_cap.bf_log_bf_reg_size); + if (t16 & 0x8000) { + gen->bf_reg_size = 1 << (t16 & 0x1f); + gen->bf_regs_per_page = MLX5_BF_REGS_PER_PAGE; + } else { + gen->bf_reg_size = 0; + gen->bf_regs_per_page = 0; + } + gen->max_sq_desc_sz = be16_to_cpu(out->hca_cap.max_desc_sz_sq); + gen->max_rq_desc_sz = be16_to_cpu(out->hca_cap.max_desc_sz_rq); + gen->max_qp_mcg = be32_to_cpu(out->hca_cap.max_qp_mcg) & 0xffffff; + gen->log_max_pd = out->hca_cap.log_max_pd & 0x1f; + gen->log_max_xrcd = out->hca_cap.log_max_xrcd; + gen->log_uar_page_sz = be16_to_cpu(out->hca_cap.log_uar_page_sz); +} + +static const char *caps_opmod_str(u16 opmod) +{ + switch (opmod) { + case HCA_CAP_OPMOD_GET_MAX: + return "GET_MAX"; + case HCA_CAP_OPMOD_GET_CUR: + return "GET_CUR"; + default: + return "Invalid"; + } +} + +int mlx5_core_get_caps(struct mlx5_core_dev *dev, struct mlx5_caps *caps, + u16 opmod) +{ + struct mlx5_cmd_query_hca_cap_mbox_out *out; + struct mlx5_cmd_query_hca_cap_mbox_in in; int err; - memset(&query_ctx, 0, sizeof(query_ctx)); - query_out = kzalloc(sizeof(*query_out), GFP_KERNEL); - if (!query_out) + memset(&in, 0, sizeof(in)); + out = kzalloc(sizeof(*out), GFP_KERNEL); + if (!out) return -ENOMEM; - set_ctx = kzalloc(sizeof(*set_ctx), GFP_KERNEL); - if (!set_ctx) { - err = -ENOMEM; + in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_QUERY_HCA_CAP); + in.hdr.opmod = cpu_to_be16(opmod); + err = mlx5_cmd_exec(dev, &in, sizeof(in), out, sizeof(*out)); + + err = mlx5_cmd_status_to_err(&out->hdr); + if (err) { + mlx5_core_warn(dev, "query max hca cap failed, %d\n", err); goto query_ex; } + mlx5_core_dbg(dev, "%s\n", caps_opmod_str(opmod)); + fw2drv_caps(caps, out); - query_ctx.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_QUERY_HCA_CAP); - query_ctx.hdr.opmod = cpu_to_be16(HCA_CAP_OPMOD_GET_CUR); - err = mlx5_cmd_exec(dev, &query_ctx, sizeof(query_ctx), - query_out, sizeof(*query_out)); +query_ex: + kfree(out); + return err; +} + +static int set_caps(struct mlx5_core_dev *dev, + struct mlx5_cmd_set_hca_cap_mbox_in *in) +{ + struct mlx5_cmd_set_hca_cap_mbox_out out; + int err; + + memset(&out, 0, sizeof(out)); + + in->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_SET_HCA_CAP); + err = mlx5_cmd_exec(dev, in, sizeof(*in), &out, sizeof(out)); if (err) - goto query_ex; + return err; - err = mlx5_cmd_status_to_err(&query_out->hdr); - if (err) { - mlx5_core_warn(dev, "query hca cap failed, %d\n", err); + err = mlx5_cmd_status_to_err(&out.hdr); + + return err; +} + +static int handle_hca_cap(struct mlx5_core_dev *dev) +{ + 
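/*
 * Editorial sketch, not part of the patch: stripped of allocation and
 * error handling, the rewritten handle_hca_cap() below is a
 * query/adjust/set sequence:
 *
 *	mlx5_core_get_caps(dev, max_caps, HCA_CAP_OPMOD_GET_MAX);
 *	mlx5_core_get_caps(dev, cur_caps, HCA_CAP_OPMOD_GET_CUR);
 *
 *	cur_caps->gen.pkey_table_size = 128;
 *	if (prof->mask & MLX5_PROF_MASK_QP_SIZE)
 *		cur_caps->gen.log_max_qp = prof->log_max_qp;
 *	cur_caps->gen.flags &= ~MLX5_DEV_CAP_FLAG_CMDIF_CSUM;
 *
 *	copy_rw_fields(&set_ctx->hca_cap, &cur_caps->gen);
 *	err = set_caps(dev, set_ctx);
 *
 * Only the fields copy_rw_fields() touches are writable. max_caps is
 * queried but not yet consumed here, presumably so later changes can
 * validate requests against the hardware maxima.
 */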
struct mlx5_cmd_set_hca_cap_mbox_in *set_ctx = NULL; + struct mlx5_profile *prof = dev->profile; + struct mlx5_caps *cur_caps = NULL; + struct mlx5_caps *max_caps = NULL; + int err = -ENOMEM; + + set_ctx = kzalloc(sizeof(*set_ctx), GFP_KERNEL); + if (!set_ctx) goto query_ex; - } - copy_rw_fields(&set_ctx->hca_cap, &query_out->hca_cap); + max_caps = kzalloc(sizeof(*max_caps), GFP_KERNEL); + if (!max_caps) + goto query_ex; - if (dev->profile && dev->profile->mask & MLX5_PROF_MASK_QP_SIZE) - set_ctx->hca_cap.log_max_qp = dev->profile->log_max_qp; + cur_caps = kzalloc(sizeof(*cur_caps), GFP_KERNEL); + if (!cur_caps) + goto query_ex; - flags = be64_to_cpu(query_out->hca_cap.flags); - /* disable checksum */ - flags &= ~MLX5_DEV_CAP_FLAG_CMDIF_CSUM; - - set_ctx->hca_cap.flags = cpu_to_be64(flags); - memset(&set_out, 0, sizeof(set_out)); - set_ctx->hca_cap.log_uar_page_sz = cpu_to_be16(PAGE_SHIFT - 12); - set_ctx->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_SET_HCA_CAP); - err = mlx5_cmd_exec(dev, set_ctx, sizeof(*set_ctx), - &set_out, sizeof(set_out)); - if (err) { - mlx5_core_warn(dev, "set hca cap failed, %d\n", err); + err = mlx5_core_get_caps(dev, max_caps, HCA_CAP_OPMOD_GET_MAX); + if (err) goto query_ex; - } - err = mlx5_cmd_status_to_err(&set_out.hdr); + err = mlx5_core_get_caps(dev, cur_caps, HCA_CAP_OPMOD_GET_CUR); if (err) goto query_ex; + /* we limit the size of the pkey table to 128 entries for now */ + cur_caps->gen.pkey_table_size = 128; + + if (prof->mask & MLX5_PROF_MASK_QP_SIZE) + cur_caps->gen.log_max_qp = prof->log_max_qp; + + /* disable checksum */ + cur_caps->gen.flags &= ~MLX5_DEV_CAP_FLAG_CMDIF_CSUM; + + copy_rw_fields(&set_ctx->hca_cap, &cur_caps->gen); + err = set_caps(dev, set_ctx); + query_ex: - kfree(query_out); + kfree(cur_caps); + kfree(max_caps); kfree(set_ctx); return err; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/uar.c b/drivers/net/ethernet/mellanox/mlx5/core/uar.c index 68f5d9c77c7b..0a6348cefc01 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/uar.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/uar.c @@ -174,11 +174,11 @@ int mlx5_alloc_uuars(struct mlx5_core_dev *dev, struct mlx5_uuar_info *uuari) for (i = 0; i < tot_uuars; i++) { bf = &uuari->bfs[i]; - bf->buf_size = dev->caps.bf_reg_size / 2; + bf->buf_size = dev->caps.gen.bf_reg_size / 2; bf->uar = &uuari->uars[i / MLX5_BF_REGS_PER_PAGE]; bf->regreg = uuari->uars[i / MLX5_BF_REGS_PER_PAGE].map; bf->reg = NULL; /* Add WC support */ - bf->offset = (i % MLX5_BF_REGS_PER_PAGE) * dev->caps.bf_reg_size + + bf->offset = (i % MLX5_BF_REGS_PER_PAGE) * dev->caps.gen.bf_reg_size + MLX5_BF_OFFSET; bf->need_lock = need_uuar_lock(i); spin_lock_init(&bf->lock); diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 334947151dfc..dce01fd854a8 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -70,6 +70,11 @@ enum { MLX5_INLINE_SEG = 0x80000000, }; +enum { + MLX5_MIN_PKEY_TABLE_SIZE = 128, + MLX5_MAX_LOG_PKEY_TABLE = 5, +}; + enum { MLX5_PERM_LOCAL_READ = 1 << 2, MLX5_PERM_LOCAL_WRITE = 1 << 3, @@ -184,10 +189,10 @@ enum { MLX5_DEV_CAP_FLAG_CQ_MODER = 1LL << 29, MLX5_DEV_CAP_FLAG_RESIZE_CQ = 1LL << 30, MLX5_DEV_CAP_FLAG_RESIZE_SRQ = 1LL << 32, + MLX5_DEV_CAP_FLAG_DCT = 1LL << 37, MLX5_DEV_CAP_FLAG_REMOTE_FENCE = 1LL << 38, MLX5_DEV_CAP_FLAG_TLP_HINTS = 1LL << 39, MLX5_DEV_CAP_FLAG_SIG_HAND_OVER = 1LL << 40, - MLX5_DEV_CAP_FLAG_DCT = 1LL << 41, MLX5_DEV_CAP_FLAG_CMDIF_CSUM = 3LL << 46, }; @@ -243,10 +248,14 @@ enum { }; enum { - MLX5_CAP_OFF_DCT = 41, 
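/*
 * Editorial sketch, not part of the patch: with the DCT capability
 * bit moved to its hardware position (bit 37 instead of 41), the
 * writable-bits mask in main.c can use the flag directly:
 *
 *	MLX5_CAP_BITS_RW_MASK = CAP_MASK(MLX5_CAP_OFF_CMDIF_CSUM, 2) |
 *				MLX5_DEV_CAP_FLAG_DCT;
 *
 * CAP_MASK(46, 2) builds 3ULL << 46, which matches the two-bit
 * MLX5_DEV_CAP_FLAG_CMDIF_CSUM field, so the mask covers exactly the
 * flag bits copy_rw_fields() may hand back to firmware.
 */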
MLX5_CAP_OFF_CMDIF_CSUM = 46, }; +enum { + HCA_CAP_OPMOD_GET_MAX = 0, + HCA_CAP_OPMOD_GET_CUR = 1, +}; + struct mlx5_inbox_hdr { __be16 opcode; u8 rsvd[4]; @@ -303,9 +312,10 @@ struct mlx5_hca_cap { u8 log_max_ra_req_qp; u8 rsvd10; u8 log_max_ra_res_qp; - u8 rsvd11[4]; + u8 pad_cap; + u8 rsvd11[3]; __be16 max_qp_count; - __be16 rsvd12; + __be16 pkey_table_size; u8 rsvd13; u8 local_ca_ack_delay; u8 rsvd14; @@ -335,11 +345,7 @@ struct mlx5_hca_cap { u8 log_max_xrcd; u8 rsvd25[42]; __be16 log_uar_page_sz; - u8 rsvd26[28]; - u8 log_max_atomic_size_qp; - u8 rsvd27[2]; - u8 log_max_atomic_size_dc; - u8 rsvd28[76]; + u8 rsvd26[108]; }; diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index b88e9b46d957..45a2add747e0 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -335,23 +335,30 @@ struct mlx5_port_caps { int pkey_table_len; }; -struct mlx5_caps { +struct mlx5_general_caps { u8 log_max_eq; u8 log_max_cq; u8 log_max_qp; u8 log_max_mkey; u8 log_max_pd; u8 log_max_srq; + u8 log_max_strq; + u8 log_max_mrw_sz; + u8 log_max_bsf_list_size; + u8 log_max_klm_list_size; u32 max_cqes; int max_wqes; + u32 max_eqes; + u32 max_indirection; int max_sq_desc_sz; int max_rq_desc_sz; + int max_dc_sq_desc_sz; u64 flags; u16 stat_rate_support; int log_max_msg; int num_ports; - int max_ra_res_qp; - int max_ra_req_qp; + u8 log_max_ra_res_qp; + u8 log_max_ra_req_qp; int max_srq_wqes; int bf_reg_size; int bf_regs_per_page; @@ -363,6 +370,19 @@ struct mlx5_caps { u8 log_max_mcg; u32 max_qp_mcg; int min_page_sz; + int pd_cap; + u32 max_qp_counters; + u32 pkey_table_size; + u8 log_max_ra_req_dc; + u8 log_max_ra_res_dc; + u32 uar_sz; + u8 min_log_pg_sz; + u8 log_max_xrcd; + u16 log_uar_page_sz; +}; + +struct mlx5_caps { + struct mlx5_general_caps gen; }; struct mlx5_cmd_mailbox { @@ -695,6 +715,8 @@ void mlx5_cmd_cleanup(struct mlx5_core_dev *dev); void mlx5_cmd_use_events(struct mlx5_core_dev *dev); void mlx5_cmd_use_polling(struct mlx5_core_dev *dev); int mlx5_cmd_status_to_err(struct mlx5_outbox_hdr *hdr); +int mlx5_core_get_caps(struct mlx5_core_dev *dev, struct mlx5_caps *caps, + u16 opmod); int mlx5_cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out, int out_size); int mlx5_cmd_exec_cb(struct mlx5_core_dev *dev, void *in, int in_size, -- cgit v1.2.3-59-g8ed1b From d29b796adada8780db3512c4a34b339f9aeef1ae Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Thu, 2 Oct 2014 12:19:43 +0300 Subject: net/mlx5_core: Use hardware registers description header file Add an auto-generated header file that describes hardware registers along with a set of macros that set/get values. The macros do static checks to avoid overflow, handle endianness, and overall provide a clean way to code commands. Currently the header file is small and we will add structs as we make use of the macros. A few commands were removed from the commands enum since they are not currently supported and will be added when support is available. Signed-off-by: Eli Cohen Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 36 ------- drivers/net/ethernet/mellanox/mlx5/core/qp.c | 3 - include/linux/mlx5/device.h | 44 ++++++++ include/linux/mlx5/driver.h | 76 +------------- include/linux/mlx5/mlx5_ifc.h | 143 ++++++++++++++++++++++++++ 5 files changed, 188 insertions(+), 114 deletions(-) create mode 100644 include/linux/mlx5/mlx5_ifc.h (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c index 6eb0f85cf872..3ecef1310bae 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -357,60 +357,24 @@ const char *mlx5_command_str(int command) case MLX5_CMD_OP_2ERR_QP: return "2ERR_QP"; - case MLX5_CMD_OP_RTS2SQD_QP: - return "RTS2SQD_QP"; - - case MLX5_CMD_OP_SQD2RTS_QP: - return "SQD2RTS_QP"; - case MLX5_CMD_OP_2RST_QP: return "2RST_QP"; case MLX5_CMD_OP_QUERY_QP: return "QUERY_QP"; - case MLX5_CMD_OP_CONF_SQP: - return "CONF_SQP"; - case MLX5_CMD_OP_MAD_IFC: return "MAD_IFC"; case MLX5_CMD_OP_INIT2INIT_QP: return "INIT2INIT_QP"; - case MLX5_CMD_OP_SUSPEND_QP: - return "SUSPEND_QP"; - - case MLX5_CMD_OP_UNSUSPEND_QP: - return "UNSUSPEND_QP"; - - case MLX5_CMD_OP_SQD2SQD_QP: - return "SQD2SQD_QP"; - - case MLX5_CMD_OP_ALLOC_QP_COUNTER_SET: - return "ALLOC_QP_COUNTER_SET"; - - case MLX5_CMD_OP_DEALLOC_QP_COUNTER_SET: - return "DEALLOC_QP_COUNTER_SET"; - - case MLX5_CMD_OP_QUERY_QP_COUNTER_SET: - return "QUERY_QP_COUNTER_SET"; - case MLX5_CMD_OP_CREATE_PSV: return "CREATE_PSV"; case MLX5_CMD_OP_DESTROY_PSV: return "DESTROY_PSV"; - case MLX5_CMD_OP_QUERY_PSV: - return "QUERY_PSV"; - - case MLX5_CMD_OP_QUERY_SIG_RULE_TABLE: - return "QUERY_SIG_RULE_TABLE"; - - case MLX5_CMD_OP_QUERY_BLOCK_SIZE_TABLE: - return "QUERY_BLOCK_SIZE_TABLE"; - case MLX5_CMD_OP_CREATE_SRQ: return "CREATE_SRQ"; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/qp.c b/drivers/net/ethernet/mellanox/mlx5/core/qp.c index 8145b4668229..415b67ce379e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/qp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/qp.c @@ -184,13 +184,10 @@ int mlx5_core_qp_modify(struct mlx5_core_dev *dev, enum mlx5_qp_state cur_state, [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, [MLX5_QP_STATE_RTS] = MLX5_CMD_OP_RTS2RTS_QP, - [MLX5_QP_STATE_SQD] = MLX5_CMD_OP_RTS2SQD_QP, }, [MLX5_QP_STATE_SQD] = { [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, - [MLX5_QP_STATE_RTS] = MLX5_CMD_OP_SQD2RTS_QP, - [MLX5_QP_STATE_SQD] = MLX5_CMD_OP_SQD2SQD_QP, }, [MLX5_QP_STATE_SQER] = { [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index dce01fd854a8..0032687f58c7 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -44,6 +44,50 @@ #error Host endianness not defined #endif +/* helper macros */ +#define __mlx5_nullp(typ) ((struct mlx5_ifc_##typ##_bits *)0) +#define __mlx5_bit_sz(typ, fld) sizeof(__mlx5_nullp(typ)->fld) +#define __mlx5_bit_off(typ, fld) ((unsigned)(unsigned long)(&(__mlx5_nullp(typ)->fld))) +#define __mlx5_dw_off(typ, fld) (__mlx5_bit_off(typ, fld) / 32) +#define __mlx5_64_off(typ, fld) (__mlx5_bit_off(typ, fld) / 64) +#define __mlx5_dw_bit_off(typ, fld) (32 - __mlx5_bit_sz(typ, fld) - (__mlx5_bit_off(typ, fld) & 0x1f)) +#define __mlx5_mask(typ, fld) ((u32)((1ull << __mlx5_bit_sz(typ, fld)) - 1)) +#define __mlx5_dw_mask(typ, fld) (__mlx5_mask(typ, fld) << 
__mlx5_dw_bit_off(typ, fld)) +#define __mlx5_st_sz_bits(typ) sizeof(struct mlx5_ifc_##typ##_bits) + +#define MLX5_FLD_SZ_BYTES(typ, fld) (__mlx5_bit_sz(typ, fld) / 8) +#define MLX5_ST_SZ_BYTES(typ) (sizeof(struct mlx5_ifc_##typ##_bits) / 8) +#define MLX5_ST_SZ_DW(typ) (sizeof(struct mlx5_ifc_##typ##_bits) / 32) +#define MLX5_BYTE_OFF(typ, fld) (__mlx5_bit_off(typ, fld) / 8) +#define MLX5_ADDR_OF(typ, p, fld) ((char *)(p) + MLX5_BYTE_OFF(typ, fld)) + +/* insert a value to a struct */ +#define MLX5_SET(typ, p, fld, v) do { \ + BUILD_BUG_ON(__mlx5_st_sz_bits(typ) % 32); \ + *((__be32 *)(p) + __mlx5_dw_off(typ, fld)) = \ + cpu_to_be32((be32_to_cpu(*((__be32 *)(p) + __mlx5_dw_off(typ, fld))) & \ + (~__mlx5_dw_mask(typ, fld))) | (((v) & __mlx5_mask(typ, fld)) \ + << __mlx5_dw_bit_off(typ, fld))); \ +} while (0) + +#define MLX5_GET(typ, p, fld) ((be32_to_cpu(*((__be32 *)(p) +\ +__mlx5_dw_off(typ, fld))) >> __mlx5_dw_bit_off(typ, fld)) & \ +__mlx5_mask(typ, fld)) + +#define MLX5_GET_PR(typ, p, fld) ({ \ + u32 ___t = MLX5_GET(typ, p, fld); \ + pr_debug(#fld " = 0x%x\n", ___t); \ + ___t; \ +}) + +#define MLX5_SET64(typ, p, fld, v) do { \ + BUILD_BUG_ON(__mlx5_bit_sz(typ, fld) != 64); \ + BUILD_BUG_ON(__mlx5_bit_off(typ, fld) % 64); \ + *((__be64 *)(p) + __mlx5_64_off(typ, fld)) = cpu_to_be64(v); \ +} while (0) + +#define MLX5_GET64(typ, p, fld) be64_to_cpu(*((__be64 *)(p) + __mlx5_64_off(typ, fld))) + enum { MLX5_MAX_COMMANDS = 32, MLX5_CMD_DATA_BLOCK_SIZE = 512, diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 45a2add747e0..6f48dc793b9f 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -44,6 +44,7 @@ #include #include +#include enum { MLX5_BOARD_ID_LEN = 64, @@ -98,81 +99,6 @@ enum { MLX5_ATOMIC_MODE_256B = 8 << 16, }; -enum { - MLX5_CMD_OP_QUERY_HCA_CAP = 0x100, - MLX5_CMD_OP_QUERY_ADAPTER = 0x101, - MLX5_CMD_OP_INIT_HCA = 0x102, - MLX5_CMD_OP_TEARDOWN_HCA = 0x103, - MLX5_CMD_OP_ENABLE_HCA = 0x104, - MLX5_CMD_OP_DISABLE_HCA = 0x105, - MLX5_CMD_OP_QUERY_PAGES = 0x107, - MLX5_CMD_OP_MANAGE_PAGES = 0x108, - MLX5_CMD_OP_SET_HCA_CAP = 0x109, - - MLX5_CMD_OP_CREATE_MKEY = 0x200, - MLX5_CMD_OP_QUERY_MKEY = 0x201, - MLX5_CMD_OP_DESTROY_MKEY = 0x202, - MLX5_CMD_OP_QUERY_SPECIAL_CONTEXTS = 0x203, - - MLX5_CMD_OP_CREATE_EQ = 0x301, - MLX5_CMD_OP_DESTROY_EQ = 0x302, - MLX5_CMD_OP_QUERY_EQ = 0x303, - - MLX5_CMD_OP_CREATE_CQ = 0x400, - MLX5_CMD_OP_DESTROY_CQ = 0x401, - MLX5_CMD_OP_QUERY_CQ = 0x402, - MLX5_CMD_OP_MODIFY_CQ = 0x403, - - MLX5_CMD_OP_CREATE_QP = 0x500, - MLX5_CMD_OP_DESTROY_QP = 0x501, - MLX5_CMD_OP_RST2INIT_QP = 0x502, - MLX5_CMD_OP_INIT2RTR_QP = 0x503, - MLX5_CMD_OP_RTR2RTS_QP = 0x504, - MLX5_CMD_OP_RTS2RTS_QP = 0x505, - MLX5_CMD_OP_SQERR2RTS_QP = 0x506, - MLX5_CMD_OP_2ERR_QP = 0x507, - MLX5_CMD_OP_RTS2SQD_QP = 0x508, - MLX5_CMD_OP_SQD2RTS_QP = 0x509, - MLX5_CMD_OP_2RST_QP = 0x50a, - MLX5_CMD_OP_QUERY_QP = 0x50b, - MLX5_CMD_OP_CONF_SQP = 0x50c, - MLX5_CMD_OP_MAD_IFC = 0x50d, - MLX5_CMD_OP_INIT2INIT_QP = 0x50e, - MLX5_CMD_OP_SUSPEND_QP = 0x50f, - MLX5_CMD_OP_UNSUSPEND_QP = 0x510, - MLX5_CMD_OP_SQD2SQD_QP = 0x511, - MLX5_CMD_OP_ALLOC_QP_COUNTER_SET = 0x512, - MLX5_CMD_OP_DEALLOC_QP_COUNTER_SET = 0x513, - MLX5_CMD_OP_QUERY_QP_COUNTER_SET = 0x514, - - MLX5_CMD_OP_CREATE_PSV = 0x600, - MLX5_CMD_OP_DESTROY_PSV = 0x601, - MLX5_CMD_OP_QUERY_PSV = 0x602, - MLX5_CMD_OP_QUERY_SIG_RULE_TABLE = 0x603, - MLX5_CMD_OP_QUERY_BLOCK_SIZE_TABLE = 0x604, - - MLX5_CMD_OP_CREATE_SRQ = 0x700, - MLX5_CMD_OP_DESTROY_SRQ = 0x701, - MLX5_CMD_OP_QUERY_SRQ = 0x702, - 
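/*
 * Editorial sketch, not part of the patch: MLX5_SET()/MLX5_GET()
 * resolve a named field, at compile time, to a dword index plus a
 * shift and mask computed from the mlx5_ifc *_bits layout (array
 * sizes are widths in bits; __mlx5_nullp() measures offsets on a
 * null pointer). For a 16-bit opcode at bit offset 0, as in the
 * query_hca_cap_in layout added in the next patch,
 *
 *	u32 in[MLX5_ST_SZ_DW(query_hca_cap_in)] = {0};
 *
 *	MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP);
 *
 * expands to roughly
 *
 *	__be32 *dw = (__be32 *)in + 0;
 *	*dw = cpu_to_be32((be32_to_cpu(*dw) & ~0xffff0000) |
 *			  ((0x100 & 0xffff) << 16));
 *
 * a read-modify-write of one big-endian dword. The BUILD_BUG_ON on
 * __mlx5_st_sz_bits(typ) % 32 turns many miscounted reserved fields
 * into build failures instead of corrupted mailboxes at run time.
 */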
MLX5_CMD_OP_ARM_RQ = 0x703, - MLX5_CMD_OP_RESIZE_SRQ = 0x704, - - MLX5_CMD_OP_ALLOC_PD = 0x800, - MLX5_CMD_OP_DEALLOC_PD = 0x801, - MLX5_CMD_OP_ALLOC_UAR = 0x802, - MLX5_CMD_OP_DEALLOC_UAR = 0x803, - - MLX5_CMD_OP_ATTACH_TO_MCG = 0x806, - MLX5_CMD_OP_DETACH_FROM_MCG = 0x807, - - - MLX5_CMD_OP_ALLOC_XRCD = 0x80e, - MLX5_CMD_OP_DEALLOC_XRCD = 0x80f, - - MLX5_CMD_OP_ACCESS_REG = 0x805, - MLX5_CMD_OP_MAX = 0x810, -}; - enum { MLX5_REG_PCAP = 0x5001, MLX5_REG_PMTU = 0x5003, diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h new file mode 100644 index 000000000000..df3bd9b5fbcf --- /dev/null +++ b/include/linux/mlx5/mlx5_ifc.h @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2014, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef MLX5_IFC_H +#define MLX5_IFC_H + +enum { + MLX5_CMD_OP_QUERY_HCA_CAP = 0x100, + MLX5_CMD_OP_QUERY_ADAPTER = 0x101, + MLX5_CMD_OP_INIT_HCA = 0x102, + MLX5_CMD_OP_TEARDOWN_HCA = 0x103, + MLX5_CMD_OP_ENABLE_HCA = 0x104, + MLX5_CMD_OP_DISABLE_HCA = 0x105, + MLX5_CMD_OP_QUERY_PAGES = 0x107, + MLX5_CMD_OP_MANAGE_PAGES = 0x108, + MLX5_CMD_OP_SET_HCA_CAP = 0x109, + MLX5_CMD_OP_CREATE_MKEY = 0x200, + MLX5_CMD_OP_QUERY_MKEY = 0x201, + MLX5_CMD_OP_DESTROY_MKEY = 0x202, + MLX5_CMD_OP_QUERY_SPECIAL_CONTEXTS = 0x203, + MLX5_CMD_OP_PAGE_FAULT_RESUME = 0x204, + MLX5_CMD_OP_CREATE_EQ = 0x301, + MLX5_CMD_OP_DESTROY_EQ = 0x302, + MLX5_CMD_OP_QUERY_EQ = 0x303, + MLX5_CMD_OP_GEN_EQE = 0x304, + MLX5_CMD_OP_CREATE_CQ = 0x400, + MLX5_CMD_OP_DESTROY_CQ = 0x401, + MLX5_CMD_OP_QUERY_CQ = 0x402, + MLX5_CMD_OP_MODIFY_CQ = 0x403, + MLX5_CMD_OP_CREATE_QP = 0x500, + MLX5_CMD_OP_DESTROY_QP = 0x501, + MLX5_CMD_OP_RST2INIT_QP = 0x502, + MLX5_CMD_OP_INIT2RTR_QP = 0x503, + MLX5_CMD_OP_RTR2RTS_QP = 0x504, + MLX5_CMD_OP_RTS2RTS_QP = 0x505, + MLX5_CMD_OP_SQERR2RTS_QP = 0x506, + MLX5_CMD_OP_2ERR_QP = 0x507, + MLX5_CMD_OP_2RST_QP = 0x50a, + MLX5_CMD_OP_QUERY_QP = 0x50b, + MLX5_CMD_OP_INIT2INIT_QP = 0x50e, + MLX5_CMD_OP_CREATE_PSV = 0x600, + MLX5_CMD_OP_DESTROY_PSV = 0x601, + MLX5_CMD_OP_CREATE_SRQ = 0x700, + MLX5_CMD_OP_DESTROY_SRQ = 0x701, + MLX5_CMD_OP_QUERY_SRQ = 0x702, + MLX5_CMD_OP_ARM_RQ = 0x703, + MLX5_CMD_OP_RESIZE_SRQ = 0x704, + MLX5_CMD_OP_CREATE_DCT = 0x710, + MLX5_CMD_OP_DESTROY_DCT = 0x711, + MLX5_CMD_OP_DRAIN_DCT = 0x712, + MLX5_CMD_OP_QUERY_DCT = 0x713, + MLX5_CMD_OP_ARM_DCT_FOR_KEY_VIOLATION = 0x714, + MLX5_CMD_OP_QUERY_VPORT_STATE = 0x750, + MLX5_CMD_OP_MODIFY_VPORT_STATE = 0x751, + MLX5_CMD_OP_QUERY_ESW_VPORT_CONTEXT = 0x752, + MLX5_CMD_OP_MODIFY_ESW_VPORT_CONTEXT = 0x753, + MLX5_CMD_OP_QUERY_NIC_VPORT_CONTEXT = 0x754, + MLX5_CMD_OP_MODIFY_NIC_VPORT_CONTEXT = 0x755, + MLX5_CMD_OP_QUERY_RCOE_ADDRESS = 0x760, + MLX5_CMD_OP_SET_ROCE_ADDRESS = 0x761, + MLX5_CMD_OP_QUERY_VPORT_COUNTER = 0x770, + MLX5_CMD_OP_ALLOC_Q_COUNTER = 0x771, + MLX5_CMD_OP_DEALLOC_Q_COUNTER = 0x772, + MLX5_CMD_OP_QUERY_Q_COUNTER = 0x773, + MLX5_CMD_OP_ALLOC_PD = 0x800, + MLX5_CMD_OP_DEALLOC_PD = 0x801, + MLX5_CMD_OP_ALLOC_UAR = 0x802, + MLX5_CMD_OP_DEALLOC_UAR = 0x803, + MLX5_CMD_OP_CONFIG_INT_MODERATION = 0x804, + MLX5_CMD_OP_ACCESS_REG = 0x805, + MLX5_CMD_OP_ATTACH_TO_MCG = 0x806, + MLX5_CMD_OP_DETACH_FROM_MCG = 0x807, + MLX5_CMD_OP_GET_DROPPED_PACKET_LOG = 0x80a, + MLX5_CMD_OP_MAD_IFC = 0x50d, + MLX5_CMD_OP_QUERY_MAD_DEMUX = 0x80b, + MLX5_CMD_OP_SET_MAD_DEMUX = 0x80c, + MLX5_CMD_OP_NOP = 0x80d, + MLX5_CMD_OP_ALLOC_XRCD = 0x80e, + MLX5_CMD_OP_DEALLOC_XRCD = 0x80f, + MLX5_CMD_OP_SET_BURST_SIZE = 0x812, + MLX5_CMD_OP_QUERY_BURST_SZIE = 0x813, + MLX5_CMD_OP_ACTIVATE_TRACER = 0x814, + MLX5_CMD_OP_DEACTIVATE_TRACER = 0x815, + MLX5_CMD_OP_CREATE_SNIFFER_RULE = 0x820, + MLX5_CMD_OP_DESTROY_SNIFFER_RULE = 0x821, + MLX5_CMD_OP_QUERY_CONG_PARAMS = 0x822, + MLX5_CMD_OP_MODIFY_CONG_PARAMS = 0x823, + MLX5_CMD_OP_QUERY_CONG_STATISTICS = 0x824, + MLX5_CMD_OP_CREATE_TIR = 0x900, + MLX5_CMD_OP_MODIFY_TIR = 0x901, + MLX5_CMD_OP_DESTROY_TIR = 0x902, + MLX5_CMD_OP_QUERY_TIR = 0x903, + MLX5_CMD_OP_CREATE_TIS = 0x912, + MLX5_CMD_OP_MODIFY_TIS = 0x913, + MLX5_CMD_OP_DESTROY_TIS = 0x914, + MLX5_CMD_OP_QUERY_TIS = 0x915, + MLX5_CMD_OP_CREATE_SQ = 0x904, + MLX5_CMD_OP_MODIFY_SQ = 0x905, + MLX5_CMD_OP_DESTROY_SQ = 0x906, + MLX5_CMD_OP_QUERY_SQ = 0x907, + MLX5_CMD_OP_CREATE_RQ = 0x908, + MLX5_CMD_OP_MODIFY_RQ = 0x909, + MLX5_CMD_OP_DESTROY_RQ = 0x90a, + 
MLX5_CMD_OP_QUERY_RQ = 0x90b, + MLX5_CMD_OP_CREATE_RMP = 0x90c, + MLX5_CMD_OP_MODIFY_RMP = 0x90d, + MLX5_CMD_OP_DESTROY_RMP = 0x90e, + MLX5_CMD_OP_QUERY_RMP = 0x90f, + MLX5_CMD_OP_SET_FLOW_TABLE_ENTRY = 0x910, + MLX5_CMD_OP_QUERY_FLOW_TABLE_ENTRY = 0x911, + MLX5_CMD_OP_MAX = 0x911 +}; + +#endif /* MLX5_IFC_H */ -- cgit v1.2.3-59-g8ed1b From b775516b042f9e35f856bd2914afefd9d23021d7 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Thu, 2 Oct 2014 12:19:44 +0300 Subject: net/mlx5_core: use set/get macros in device caps Transform device capabilities related commands to use set/get macros to manipulate command mailboxes. Signed-off-by: Eli Cohen Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 17 ++ drivers/net/ethernet/mellanox/mlx5/core/main.c | 150 +++++++++--------- include/linux/mlx5/device.h | 92 ----------- include/linux/mlx5/driver.h | 1 + include/linux/mlx5/mlx5_ifc.h | 206 +++++++++++++++++++++++++ 5 files changed, 298 insertions(+), 168 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c index 3ecef1310bae..368c6c5ea014 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -1537,3 +1537,20 @@ int mlx5_cmd_status_to_err(struct mlx5_outbox_hdr *hdr) return cmd_status_to_err(hdr->status); } + +int mlx5_cmd_status_to_err_v2(void *ptr) +{ + u32 syndrome; + u8 status; + + status = be32_to_cpu(*(__be32 *)ptr) >> 24; + if (!status) + return 0; + + syndrome = be32_to_cpu(*(__be32 *)(ptr + 4)); + + pr_warn("command failed, status %s(0x%x), syndrome 0x%x\n", + cmd_status_str(status), status, syndrome); + + return cmd_status_to_err(status); +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index d9f74618befa..b9e3259e415f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -43,6 +43,7 @@ #include #include #include +#include #include "mlx5_core.h" #define DRIVER_NAME "mlx5_core" @@ -277,18 +278,20 @@ static u16 to_fw_pkey_sz(u32 size) /* selectively copy writable fields clearing any reserved area */ -static void copy_rw_fields(struct mlx5_hca_cap *to, struct mlx5_general_caps *from) +static void copy_rw_fields(void *to, struct mlx5_caps *from) { + __be64 *flags_off = (__be64 *)MLX5_ADDR_OF(cmd_hca_cap, to, reserved_22); u64 v64; - to->log_max_qp = from->log_max_qp & 0x1f; - to->log_max_ra_req_dc = from->log_max_ra_req_dc & 0x3f; - to->log_max_ra_res_dc = from->log_max_ra_res_dc & 0x3f; - to->log_max_ra_req_qp = from->log_max_ra_req_qp & 0x3f; - to->log_max_ra_res_qp = from->log_max_ra_res_qp & 0x3f; - to->pkey_table_size = cpu_to_be16(to_fw_pkey_sz(from->pkey_table_size)); - v64 = from->flags & MLX5_CAP_BITS_RW_MASK; - to->flags = cpu_to_be64(v64); + MLX5_SET(cmd_hca_cap, to, log_max_qp, from->gen.log_max_qp); + MLX5_SET(cmd_hca_cap, to, log_max_ra_req_qp, from->gen.log_max_ra_req_qp); + MLX5_SET(cmd_hca_cap, to, log_max_ra_res_qp, from->gen.log_max_ra_res_qp); + MLX5_SET(cmd_hca_cap, to, pkey_table_size, from->gen.pkey_table_size); + MLX5_SET(cmd_hca_cap, to, log_max_ra_req_dc, from->gen.log_max_ra_req_dc); + MLX5_SET(cmd_hca_cap, to, log_max_ra_res_dc, from->gen.log_max_ra_res_dc); + MLX5_SET(cmd_hca_cap, to, pkey_table_size, to_fw_pkey_sz(from->gen.pkey_table_size)); + v64 = from->gen.flags & MLX5_CAP_BITS_RW_MASK; + *flags_off = cpu_to_be64(v64); } static u16 get_pkey_table_size(int 
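/*
 * Editorial sketch, not part of the patch: mlx5_cmd_status_to_err_v2()
 * can take a bare void pointer because every new-style output mailbox
 * begins with the same two dwords (see the *_out_bits structs later
 * in this patch):
 *
 *	u8 status[0x8];		the top byte of the first dword
 *	u8 reserved_0[0x18];
 *	u8 syndrome[0x20];	the whole second dword
 *
 * hence the open-coded extraction:
 *
 *	status   = be32_to_cpu(*(__be32 *)ptr) >> 24;
 *	syndrome = be32_to_cpu(*(__be32 *)(ptr + 4));
 */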
pkey) @@ -299,55 +302,47 @@ static u16 get_pkey_table_size(int pkey) return MLX5_MIN_PKEY_TABLE_SIZE << pkey; } -static void fw2drv_caps(struct mlx5_caps *caps, - struct mlx5_cmd_query_hca_cap_mbox_out *out) +static void fw2drv_caps(struct mlx5_caps *caps, void *out) { struct mlx5_general_caps *gen = &caps->gen; - u16 t16; - - gen->max_srq_wqes = 1 << out->hca_cap.log_max_srq_sz; - gen->max_wqes = 1 << out->hca_cap.log_max_qp_sz; - gen->log_max_qp = out->hca_cap.log_max_qp & 0x1f; - gen->log_max_strq = out->hca_cap.log_max_strq_sz; - gen->log_max_srq = out->hca_cap.log_max_srqs & 0x1f; - gen->max_cqes = 1 << out->hca_cap.log_max_cq_sz; - gen->log_max_cq = out->hca_cap.log_max_cq & 0x1f; - gen->max_eqes = out->hca_cap.log_max_eq_sz; - gen->log_max_mkey = out->hca_cap.log_max_mkey & 0x3f; - gen->log_max_eq = out->hca_cap.log_max_eq & 0xf; - gen->max_indirection = out->hca_cap.max_indirection; - gen->log_max_mrw_sz = out->hca_cap.log_max_mrw_sz; - gen->log_max_bsf_list_size = 0; - gen->log_max_klm_list_size = 0; - gen->log_max_ra_req_dc = out->hca_cap.log_max_ra_req_dc; - gen->log_max_ra_res_dc = out->hca_cap.log_max_ra_res_dc; - gen->log_max_ra_req_qp = out->hca_cap.log_max_ra_req_qp; - gen->log_max_ra_res_qp = out->hca_cap.log_max_ra_res_qp; - gen->max_qp_counters = be16_to_cpu(out->hca_cap.max_qp_count); - gen->pkey_table_size = get_pkey_table_size(be16_to_cpu(out->hca_cap.pkey_table_size)); - gen->local_ca_ack_delay = out->hca_cap.local_ca_ack_delay & 0x1f; - gen->num_ports = out->hca_cap.num_ports & 0xf; - gen->log_max_msg = out->hca_cap.log_max_msg & 0x1f; - gen->stat_rate_support = be16_to_cpu(out->hca_cap.stat_rate_support); - gen->flags = be64_to_cpu(out->hca_cap.flags); - pr_debug("flags = 0x%llx\n", gen->flags); - gen->uar_sz = out->hca_cap.uar_sz; - gen->min_log_pg_sz = out->hca_cap.log_pg_sz; - t16 = be16_to_cpu(out->hca_cap.bf_log_bf_reg_size); - if (t16 & 0x8000) { - gen->bf_reg_size = 1 << (t16 & 0x1f); - gen->bf_regs_per_page = MLX5_BF_REGS_PER_PAGE; - } else { - gen->bf_reg_size = 0; - gen->bf_regs_per_page = 0; - } - gen->max_sq_desc_sz = be16_to_cpu(out->hca_cap.max_desc_sz_sq); - gen->max_rq_desc_sz = be16_to_cpu(out->hca_cap.max_desc_sz_rq); - gen->max_qp_mcg = be32_to_cpu(out->hca_cap.max_qp_mcg) & 0xffffff; - gen->log_max_pd = out->hca_cap.log_max_pd & 0x1f; - gen->log_max_xrcd = out->hca_cap.log_max_xrcd; - gen->log_uar_page_sz = be16_to_cpu(out->hca_cap.log_uar_page_sz); + gen->max_srq_wqes = 1 << MLX5_GET_PR(cmd_hca_cap, out, log_max_srq_sz); + gen->max_wqes = 1 << MLX5_GET_PR(cmd_hca_cap, out, log_max_qp_sz); + gen->log_max_qp = MLX5_GET_PR(cmd_hca_cap, out, log_max_qp); + gen->log_max_strq = MLX5_GET_PR(cmd_hca_cap, out, log_max_strq_sz); + gen->log_max_srq = MLX5_GET_PR(cmd_hca_cap, out, log_max_srqs); + gen->max_cqes = 1 << MLX5_GET_PR(cmd_hca_cap, out, log_max_cq_sz); + gen->log_max_cq = MLX5_GET_PR(cmd_hca_cap, out, log_max_cq); + gen->max_eqes = 1 << MLX5_GET_PR(cmd_hca_cap, out, log_max_eq_sz); + gen->log_max_mkey = MLX5_GET_PR(cmd_hca_cap, out, log_max_mkey); + gen->log_max_eq = MLX5_GET_PR(cmd_hca_cap, out, log_max_eq); + gen->max_indirection = MLX5_GET_PR(cmd_hca_cap, out, max_indirection); + gen->log_max_mrw_sz = MLX5_GET_PR(cmd_hca_cap, out, log_max_mrw_sz); + gen->log_max_bsf_list_size = MLX5_GET_PR(cmd_hca_cap, out, log_max_bsf_list_size); + gen->log_max_klm_list_size = MLX5_GET_PR(cmd_hca_cap, out, log_max_klm_list_size); + gen->log_max_ra_req_dc = MLX5_GET_PR(cmd_hca_cap, out, log_max_ra_req_dc); + gen->log_max_ra_res_dc = MLX5_GET_PR(cmd_hca_cap, 
out, log_max_ra_res_dc); + gen->log_max_ra_req_qp = MLX5_GET_PR(cmd_hca_cap, out, log_max_ra_req_qp); + gen->log_max_ra_res_qp = MLX5_GET_PR(cmd_hca_cap, out, log_max_ra_res_qp); + gen->max_qp_counters = MLX5_GET_PR(cmd_hca_cap, out, max_qp_cnt); + gen->pkey_table_size = get_pkey_table_size(MLX5_GET_PR(cmd_hca_cap, out, pkey_table_size)); + gen->local_ca_ack_delay = MLX5_GET_PR(cmd_hca_cap, out, local_ca_ack_delay); + gen->num_ports = MLX5_GET_PR(cmd_hca_cap, out, num_ports); + gen->log_max_msg = MLX5_GET_PR(cmd_hca_cap, out, log_max_msg); + gen->stat_rate_support = MLX5_GET_PR(cmd_hca_cap, out, stat_rate_support); + gen->flags = be64_to_cpu(*(__be64 *)MLX5_ADDR_OF(cmd_hca_cap, out, reserved_22)); + pr_debug("flags = 0x%llx\n", gen->flags); + gen->uar_sz = MLX5_GET_PR(cmd_hca_cap, out, uar_sz); + gen->min_log_pg_sz = MLX5_GET_PR(cmd_hca_cap, out, log_pg_sz); + gen->bf_reg_size = MLX5_GET_PR(cmd_hca_cap, out, bf); + gen->bf_reg_size = 1 << MLX5_GET_PR(cmd_hca_cap, out, log_bf_reg_size); + gen->max_sq_desc_sz = MLX5_GET_PR(cmd_hca_cap, out, max_wqe_sz_sq); + gen->max_rq_desc_sz = MLX5_GET_PR(cmd_hca_cap, out, max_wqe_sz_rq); + gen->max_dc_sq_desc_sz = MLX5_GET_PR(cmd_hca_cap, out, max_wqe_sz_sq_dc); + gen->max_qp_mcg = MLX5_GET_PR(cmd_hca_cap, out, max_qp_mcg); + gen->log_max_pd = MLX5_GET_PR(cmd_hca_cap, out, log_max_pd); + gen->log_max_xrcd = MLX5_GET_PR(cmd_hca_cap, out, log_max_xrcd); + gen->log_uar_page_sz = MLX5_GET_PR(cmd_hca_cap, out, log_uar_page_sz); } static const char *caps_opmod_str(u16 opmod) @@ -365,59 +360,61 @@ static const char *caps_opmod_str(u16 opmod) int mlx5_core_get_caps(struct mlx5_core_dev *dev, struct mlx5_caps *caps, u16 opmod) { - struct mlx5_cmd_query_hca_cap_mbox_out *out; - struct mlx5_cmd_query_hca_cap_mbox_in in; + u8 in[MLX5_ST_SZ_BYTES(query_hca_cap_in)]; + int out_sz = MLX5_ST_SZ_BYTES(query_hca_cap_out); + void *out; int err; - memset(&in, 0, sizeof(in)); - out = kzalloc(sizeof(*out), GFP_KERNEL); + memset(in, 0, sizeof(in)); + out = kzalloc(out_sz, GFP_KERNEL); if (!out) return -ENOMEM; + MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP); + MLX5_SET(query_hca_cap_in, in, op_mod, opmod); + err = mlx5_cmd_exec(dev, in, sizeof(in), out, out_sz); + if (err) + goto query_ex; - in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_QUERY_HCA_CAP); - in.hdr.opmod = cpu_to_be16(opmod); - err = mlx5_cmd_exec(dev, &in, sizeof(in), out, sizeof(*out)); - - err = mlx5_cmd_status_to_err(&out->hdr); + err = mlx5_cmd_status_to_err_v2(out); if (err) { mlx5_core_warn(dev, "query max hca cap failed, %d\n", err); goto query_ex; } mlx5_core_dbg(dev, "%s\n", caps_opmod_str(opmod)); - fw2drv_caps(caps, out); + fw2drv_caps(caps, MLX5_ADDR_OF(query_hca_cap_out, out, capability_struct)); query_ex: kfree(out); return err; } -static int set_caps(struct mlx5_core_dev *dev, - struct mlx5_cmd_set_hca_cap_mbox_in *in) +static int set_caps(struct mlx5_core_dev *dev, void *in, int in_sz) { - struct mlx5_cmd_set_hca_cap_mbox_out out; + u32 out[MLX5_ST_SZ_DW(set_hca_cap_out)]; int err; - memset(&out, 0, sizeof(out)); + memset(out, 0, sizeof(out)); - in->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_SET_HCA_CAP); - err = mlx5_cmd_exec(dev, in, sizeof(*in), &out, sizeof(out)); + MLX5_SET(set_hca_cap_in, in, opcode, MLX5_CMD_OP_SET_HCA_CAP); + err = mlx5_cmd_exec(dev, in, in_sz, out, sizeof(out)); if (err) return err; - err = mlx5_cmd_status_to_err(&out.hdr); + err = mlx5_cmd_status_to_err_v2(out); return err; } static int handle_hca_cap(struct mlx5_core_dev *dev) { - struct 
mlx5_cmd_set_hca_cap_mbox_in *set_ctx = NULL; + void *set_ctx = NULL; struct mlx5_profile *prof = dev->profile; struct mlx5_caps *cur_caps = NULL; struct mlx5_caps *max_caps = NULL; int err = -ENOMEM; + int set_sz = MLX5_ST_SZ_BYTES(set_hca_cap_in); - set_ctx = kzalloc(sizeof(*set_ctx), GFP_KERNEL); + set_ctx = kzalloc(set_sz, GFP_KERNEL); if (!set_ctx) goto query_ex; @@ -446,8 +443,9 @@ static int handle_hca_cap(struct mlx5_core_dev *dev) /* disable checksum */ cur_caps->gen.flags &= ~MLX5_DEV_CAP_FLAG_CMDIF_CSUM; - copy_rw_fields(&set_ctx->hca_cap, &cur_caps->gen); - err = set_caps(dev, set_ctx); + copy_rw_fields(MLX5_ADDR_OF(set_hca_cap_in, set_ctx, hca_capability_struct), + cur_caps); + err = set_caps(dev, set_ctx, set_sz); query_ex: kfree(cur_caps); diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 0032687f58c7..1d67fd32e71c 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -327,98 +327,6 @@ struct mlx5_cmd_query_adapter_mbox_out { u8 vsd_psid[16]; }; -struct mlx5_hca_cap { - u8 rsvd1[16]; - u8 log_max_srq_sz; - u8 log_max_qp_sz; - u8 rsvd2; - u8 log_max_qp; - u8 log_max_strq_sz; - u8 log_max_srqs; - u8 rsvd4[2]; - u8 rsvd5; - u8 log_max_cq_sz; - u8 rsvd6; - u8 log_max_cq; - u8 log_max_eq_sz; - u8 log_max_mkey; - u8 rsvd7; - u8 log_max_eq; - u8 max_indirection; - u8 log_max_mrw_sz; - u8 log_max_bsf_list_sz; - u8 log_max_klm_list_sz; - u8 rsvd_8_0; - u8 log_max_ra_req_dc; - u8 rsvd_8_1; - u8 log_max_ra_res_dc; - u8 rsvd9; - u8 log_max_ra_req_qp; - u8 rsvd10; - u8 log_max_ra_res_qp; - u8 pad_cap; - u8 rsvd11[3]; - __be16 max_qp_count; - __be16 pkey_table_size; - u8 rsvd13; - u8 local_ca_ack_delay; - u8 rsvd14; - u8 num_ports; - u8 log_max_msg; - u8 rsvd15[3]; - __be16 stat_rate_support; - u8 rsvd16[2]; - __be64 flags; - u8 rsvd17; - u8 uar_sz; - u8 rsvd18; - u8 log_pg_sz; - __be16 bf_log_bf_reg_size; - u8 rsvd19[4]; - __be16 max_desc_sz_sq; - u8 rsvd20[2]; - __be16 max_desc_sz_rq; - u8 rsvd21[2]; - __be16 max_desc_sz_sq_dc; - __be32 max_qp_mcg; - u8 rsvd22[3]; - u8 log_max_mcg; - u8 rsvd23; - u8 log_max_pd; - u8 rsvd24; - u8 log_max_xrcd; - u8 rsvd25[42]; - __be16 log_uar_page_sz; - u8 rsvd26[108]; -}; - - -struct mlx5_cmd_query_hca_cap_mbox_in { - struct mlx5_inbox_hdr hdr; - u8 rsvd[8]; -}; - - -struct mlx5_cmd_query_hca_cap_mbox_out { - struct mlx5_outbox_hdr hdr; - u8 rsvd0[8]; - struct mlx5_hca_cap hca_cap; -}; - - -struct mlx5_cmd_set_hca_cap_mbox_in { - struct mlx5_inbox_hdr hdr; - u8 rsvd[8]; - struct mlx5_hca_cap hca_cap; -}; - - -struct mlx5_cmd_set_hca_cap_mbox_out { - struct mlx5_outbox_hdr hdr; - u8 rsvd0[8]; -}; - - struct mlx5_cmd_init_hca_mbox_in { struct mlx5_inbox_hdr hdr; u8 rsvd0[2]; diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 6f48dc793b9f..c439f9c59b93 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -641,6 +641,7 @@ void mlx5_cmd_cleanup(struct mlx5_core_dev *dev); void mlx5_cmd_use_events(struct mlx5_core_dev *dev); void mlx5_cmd_use_polling(struct mlx5_core_dev *dev); int mlx5_cmd_status_to_err(struct mlx5_outbox_hdr *hdr); +int mlx5_cmd_status_to_err_v2(void *ptr); int mlx5_core_get_caps(struct mlx5_core_dev *dev, struct mlx5_caps *caps, u16 opmod); int mlx5_cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out, diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index df3bd9b5fbcf..5f48b8f592c5 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -140,4 +140,210 @@ enum { 
MLX5_CMD_OP_MAX = 0x911 }; +struct mlx5_ifc_cmd_hca_cap_bits { + u8 reserved_0[0x80]; + + u8 log_max_srq_sz[0x8]; + u8 log_max_qp_sz[0x8]; + u8 reserved_1[0xb]; + u8 log_max_qp[0x5]; + + u8 log_max_strq_sz[0x8]; + u8 reserved_2[0x3]; + u8 log_max_srqs[0x5]; + u8 reserved_3[0x10]; + + u8 reserved_4[0x8]; + u8 log_max_cq_sz[0x8]; + u8 reserved_5[0xb]; + u8 log_max_cq[0x5]; + + u8 log_max_eq_sz[0x8]; + u8 reserved_6[0x2]; + u8 log_max_mkey[0x6]; + u8 reserved_7[0xc]; + u8 log_max_eq[0x4]; + + u8 max_indirection[0x8]; + u8 reserved_8[0x1]; + u8 log_max_mrw_sz[0x7]; + u8 reserved_9[0x2]; + u8 log_max_bsf_list_size[0x6]; + u8 reserved_10[0x2]; + u8 log_max_klm_list_size[0x6]; + + u8 reserved_11[0xa]; + u8 log_max_ra_req_dc[0x6]; + u8 reserved_12[0xa]; + u8 log_max_ra_res_dc[0x6]; + + u8 reserved_13[0xa]; + u8 log_max_ra_req_qp[0x6]; + u8 reserved_14[0xa]; + u8 log_max_ra_res_qp[0x6]; + + u8 pad_cap[0x1]; + u8 cc_query_allowed[0x1]; + u8 cc_modify_allowed[0x1]; + u8 reserved_15[0x1d]; + + u8 reserved_16[0x6]; + u8 max_qp_cnt[0xa]; + u8 pkey_table_size[0x10]; + + u8 eswitch_owner[0x1]; + u8 reserved_17[0xa]; + u8 local_ca_ack_delay[0x5]; + u8 reserved_18[0x8]; + u8 num_ports[0x8]; + + u8 reserved_19[0x3]; + u8 log_max_msg[0x5]; + u8 reserved_20[0x18]; + + u8 stat_rate_support[0x10]; + u8 reserved_21[0x10]; + + u8 reserved_22[0x10]; + u8 cmdif_checksum[0x2]; + u8 sigerr_cqe[0x1]; + u8 reserved_23[0x1]; + u8 wq_signature[0x1]; + u8 sctr_data_cqe[0x1]; + u8 reserved_24[0x1]; + u8 sho[0x1]; + u8 tph[0x1]; + u8 rf[0x1]; + u8 dc[0x1]; + u8 reserved_25[0x2]; + u8 roce[0x1]; + u8 atomic[0x1]; + u8 rsz_srq[0x1]; + + u8 cq_oi[0x1]; + u8 cq_resize[0x1]; + u8 cq_moderation[0x1]; + u8 sniffer_rule_flow[0x1]; + u8 sniffer_rule_vport[0x1]; + u8 sniffer_rule_phy[0x1]; + u8 reserved_26[0x1]; + u8 pg[0x1]; + u8 block_lb_mc[0x1]; + u8 reserved_27[0x3]; + u8 cd[0x1]; + u8 reserved_28[0x1]; + u8 apm[0x1]; + u8 reserved_29[0x7]; + u8 qkv[0x1]; + u8 pkv[0x1]; + u8 reserved_30[0x4]; + u8 xrc[0x1]; + u8 ud[0x1]; + u8 uc[0x1]; + u8 rc[0x1]; + + u8 reserved_31[0xa]; + u8 uar_sz[0x6]; + u8 reserved_32[0x8]; + u8 log_pg_sz[0x8]; + + u8 bf[0x1]; + u8 reserved_33[0xa]; + u8 log_bf_reg_size[0x5]; + u8 reserved_34[0x10]; + + u8 reserved_35[0x10]; + u8 max_wqe_sz_sq[0x10]; + + u8 reserved_36[0x10]; + u8 max_wqe_sz_rq[0x10]; + + u8 reserved_37[0x10]; + u8 max_wqe_sz_sq_dc[0x10]; + + u8 reserved_38[0x7]; + u8 max_qp_mcg[0x19]; + + u8 reserved_39[0x18]; + u8 log_max_mcg[0x8]; + + u8 reserved_40[0xb]; + u8 log_max_pd[0x5]; + u8 reserved_41[0xb]; + u8 log_max_xrcd[0x5]; + + u8 reserved_42[0x20]; + + u8 reserved_43[0x3]; + u8 log_max_rq[0x5]; + u8 reserved_44[0x3]; + u8 log_max_sq[0x5]; + u8 reserved_45[0x3]; + u8 log_max_tir[0x5]; + u8 reserved_46[0x3]; + u8 log_max_tis[0x5]; + + u8 reserved_47[0x13]; + u8 log_max_rq_per_tir[0x5]; + u8 reserved_48[0x3]; + u8 log_max_tis_per_sq[0x5]; + + u8 reserved_49[0xe0]; + + u8 reserved_50[0x10]; + u8 log_uar_page_sz[0x10]; + + u8 reserved_51[0x100]; + + u8 reserved_52[0x1f]; + u8 cqe_zip[0x1]; + + u8 cqe_zip_timeout[0x10]; + u8 cqe_zip_max_num[0x10]; + + u8 reserved_53[0x220]; +}; + +struct mlx5_ifc_set_hca_cap_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x40]; + + struct mlx5_ifc_cmd_hca_cap_bits hca_capability_struct; +}; + +struct mlx5_ifc_query_hca_cap_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x40]; +}; + +struct mlx5_ifc_query_hca_cap_out_bits { + u8 
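/*
 * Editorial sketch, not part of the patch: to see how the *_bits
 * convention feeds the accessor macros, take log_max_qp in
 * mlx5_ifc_cmd_hca_cap_bits above. Summing the declared widths gives
 *
 *	reserved_0[0x80]	bits   0..127
 *	log_max_srq_sz[0x8]	bits 128..135
 *	log_max_qp_sz[0x8]	bits 136..143
 *	reserved_1[0xb]		bits 144..154
 *	log_max_qp[0x5]		bits 155..159
 *
 * so MLX5_GET(cmd_hca_cap, p, log_max_qp) reads dword 155 / 32 = 4,
 * shifts right by 32 - 5 - (155 & 0x1f) = 0 and masks with 0x1f. The
 * 256-byte capability_struct[] window in query_hca_cap_out_bits is
 * the region MLX5_ADDR_OF() hands to fw2drv_caps() in main.c.
 */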
status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; + + u8 capability_struct[256][0x8]; +}; + +struct mlx5_ifc_set_hca_cap_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; +}; + #endif /* MLX5_IFC_H */ -- cgit v1.2.3-59-g8ed1b From 5903325a64834211daf63a62db3b35ee580cb8bf Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Thu, 2 Oct 2014 12:19:45 +0300 Subject: net/mlx5_core: Identify resources by their type This patch puts a common part as the first field of mlx5_core_qp. This field is used to identify which resource generated an event. This is required since upcoming new resource types such as DC targets are allocated for the same numerical space as regular QPs and may generate the same events. By searching the resource in the same table we can then look at the common field to identify the resource. Signed-off-by: Eli Cohen Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/eq.c | 12 +++--- drivers/net/ethernet/mellanox/mlx5/core/qp.c | 57 ++++++++++++++++++++-------- include/linux/mlx5/driver.h | 13 ++++++- include/linux/mlx5/qp.h | 3 +- 4 files changed, 60 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c index 11b9b840ad4d..ed53291468f3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c @@ -198,7 +198,7 @@ static int mlx5_eq_int(struct mlx5_core_dev *dev, struct mlx5_eq *eq) int eqes_found = 0; int set_ci = 0; u32 cqn; - u32 srqn; + u32 rsn; u8 port; while ((eqe = next_eqe_sw(eq))) { @@ -224,18 +224,18 @@ static int mlx5_eq_int(struct mlx5_core_dev *dev, struct mlx5_eq *eq) case MLX5_EVENT_TYPE_PATH_MIG_FAILED: case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR: case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR: + rsn = be32_to_cpu(eqe->data.qp_srq.qp_srq_n) & 0xffffff; mlx5_core_dbg(dev, "event %s(%d) arrived\n", eqe_type_str(eqe->type), eqe->type); - mlx5_qp_event(dev, be32_to_cpu(eqe->data.qp_srq.qp_srq_n) & 0xffffff, - eqe->type); + mlx5_rsc_event(dev, rsn, eqe->type); break; case MLX5_EVENT_TYPE_SRQ_RQ_LIMIT: case MLX5_EVENT_TYPE_SRQ_CATAS_ERROR: - srqn = be32_to_cpu(eqe->data.qp_srq.qp_srq_n) & 0xffffff; + rsn = be32_to_cpu(eqe->data.qp_srq.qp_srq_n) & 0xffffff; mlx5_core_dbg(dev, "SRQ event %s(%d): srqn 0x%x\n", - eqe_type_str(eqe->type), eqe->type, srqn); - mlx5_srq_event(dev, srqn, eqe->type); + eqe_type_str(eqe->type), eqe->type, rsn); + mlx5_srq_event(dev, rsn, eqe->type); break; case MLX5_EVENT_TYPE_CMD: diff --git a/drivers/net/ethernet/mellanox/mlx5/core/qp.c b/drivers/net/ethernet/mellanox/mlx5/core/qp.c index 415b67ce379e..5261a2b0da43 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/qp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/qp.c @@ -39,28 +39,53 @@ #include "mlx5_core.h" -void mlx5_qp_event(struct mlx5_core_dev *dev, u32 qpn, int event_type) +static struct mlx5_core_rsc_common *mlx5_get_rsc(struct mlx5_core_dev *dev, + u32 rsn) { struct mlx5_qp_table *table = &dev->priv.qp_table; - struct mlx5_core_qp *qp; + struct mlx5_core_rsc_common *common; spin_lock(&table->lock); - qp = radix_tree_lookup(&table->tree, qpn); - if (qp) - atomic_inc(&qp->refcount); + common = radix_tree_lookup(&table->tree, rsn); + if (common) + atomic_inc(&common->refcount); spin_unlock(&table->lock); - if (!qp) { - mlx5_core_warn(dev, "Async event for bogus QP 0x%x\n", qpn); - return; + if (!common) { + mlx5_core_warn(dev, "Async event 
for bogus resource 0x%x\n", + rsn); + return NULL; } + return common; +} - qp->event(qp, event_type); +void mlx5_core_put_rsc(struct mlx5_core_rsc_common *common) +{ + if (atomic_dec_and_test(&common->refcount)) + complete(&common->free); +} + +void mlx5_rsc_event(struct mlx5_core_dev *dev, u32 rsn, int event_type) +{ + struct mlx5_core_rsc_common *common = mlx5_get_rsc(dev, rsn); + struct mlx5_core_qp *qp; + + if (!common) + return; + + switch (common->res) { + case MLX5_RES_QP: + qp = (struct mlx5_core_qp *)common; + qp->event(qp, event_type); + break; + + default: + mlx5_core_warn(dev, "invalid resource type for 0x%x\n", rsn); + } - if (atomic_dec_and_test(&qp->refcount)) - complete(&qp->free); + mlx5_core_put_rsc(common); } int mlx5_core_create_qp(struct mlx5_core_dev *dev, @@ -92,6 +117,7 @@ int mlx5_core_create_qp(struct mlx5_core_dev *dev, qp->qpn = be32_to_cpu(out.qpn) & 0xffffff; mlx5_core_dbg(dev, "qpn = 0x%x\n", qp->qpn); + qp->common.res = MLX5_RES_QP; spin_lock_irq(&table->lock); err = radix_tree_insert(&table->tree, qp->qpn, qp); spin_unlock_irq(&table->lock); @@ -106,9 +132,9 @@ int mlx5_core_create_qp(struct mlx5_core_dev *dev, qp->qpn); qp->pid = current->pid; - atomic_set(&qp->refcount, 1); + atomic_set(&qp->common.refcount, 1); atomic_inc(&dev->num_qps); - init_completion(&qp->free); + init_completion(&qp->common.free); return 0; @@ -138,9 +164,8 @@ int mlx5_core_destroy_qp(struct mlx5_core_dev *dev, radix_tree_delete(&table->tree, qp->qpn); spin_unlock_irqrestore(&table->lock, flags); - if (atomic_dec_and_test(&qp->refcount)) - complete(&qp->free); - wait_for_completion(&qp->free); + mlx5_core_put_rsc((struct mlx5_core_rsc_common *)qp); + wait_for_completion(&qp->common.free); memset(&in, 0, sizeof(in)); memset(&out, 0, sizeof(out)); diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index c439f9c59b93..246310dc8bef 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -375,6 +375,16 @@ struct mlx5_core_mr { u32 pd; }; +enum mlx5_res_type { + MLX5_RES_QP, +}; + +struct mlx5_core_rsc_common { + enum mlx5_res_type res; + atomic_t refcount; + struct completion free; +}; + struct mlx5_core_srq { u32 srqn; int max; @@ -700,7 +710,7 @@ int mlx5_eq_init(struct mlx5_core_dev *dev); void mlx5_eq_cleanup(struct mlx5_core_dev *dev); void mlx5_fill_page_array(struct mlx5_buf *buf, __be64 *pas); void mlx5_cq_completion(struct mlx5_core_dev *dev, u32 cqn); -void mlx5_qp_event(struct mlx5_core_dev *dev, u32 qpn, int event_type); +void mlx5_rsc_event(struct mlx5_core_dev *dev, u32 rsn, int event_type); void mlx5_srq_event(struct mlx5_core_dev *dev, u32 srqn, int event_type); struct mlx5_core_srq *mlx5_core_get_srq(struct mlx5_core_dev *dev, u32 srqn); void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, unsigned long vector); @@ -737,6 +747,7 @@ void mlx5_cmdif_debugfs_cleanup(struct mlx5_core_dev *dev); int mlx5_core_create_psv(struct mlx5_core_dev *dev, u32 pdn, int npsvs, u32 *sig_index); int mlx5_core_destroy_psv(struct mlx5_core_dev *dev, int psv_num); +void mlx5_core_put_rsc(struct mlx5_core_rsc_common *common); static inline u32 mlx5_mkey_to_idx(u32 mkey) { diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index 9709b30e2d69..7c4c0f1f5805 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -342,10 +342,9 @@ struct mlx5_stride_block_ctrl_seg { }; struct mlx5_core_qp { + struct mlx5_core_rsc_common common; /* must be first */ void (*event) (struct mlx5_core_qp *, int); int qpn; - atomic_t refcount; - 
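/*
 * Editorial sketch, not part of the patch: with mlx5_core_rsc_common
 * required to be the first member, one radix tree can hold several
 * resource types and the event path recovers the full object with a
 * cast once common->res has been inspected:
 *
 *	switch (common->res) {
 *	case MLX5_RES_QP:
 *		qp = (struct mlx5_core_qp *)common;
 *		qp->event(qp, event_type);
 *		break;
 *	case MLX5_RES_DCT:	hypothetical future type; DC targets
 *		...		are the motivation the changelog cites
 *	}
 *
 * Teardown pairs the refcount with the completion: destroy drops the
 * initial reference via mlx5_core_put_rsc() and then blocks in
 * wait_for_completion(&qp->common.free) until any concurrent event
 * handler has dropped its reference as well.
 */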
struct completion free; struct mlx5_rsc_debug *dbg; int pid; }; -- cgit v1.2.3-59-g8ed1b From efc98d08e1ec4fd131f794370b274dceaf32c958 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Fri, 3 Oct 2014 15:48:08 -0700 Subject: fou: eliminate IPv4,v6 specific GRO functions This patch removes the fou[46]_gro_receive and fou[46]_gro_complete functions. The v4 or v6 variants were chosen for the UDP offloads based on the address family of the socket; this is not necessary or correct. Instead, this patch adds is_ipv6 to napi_gro_cb. This is set in udp6_gro_receive and unset in udp4_gro_receive. In fou_gro_receive the value is used to select the correct inet_offloads for the protocol of the outer IP header. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 +++ net/ipv4/fou.c | 48 ++++++++--------------------------------------- net/ipv4/udp_offload.c | 1 + net/ipv6/udp_offload.c | 1 + 4 files changed, 13 insertions(+), 40 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 910fb17ad148..22d54b9b700d 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1886,6 +1886,9 @@ struct napi_gro_cb { /* Number of checksums via CHECKSUM_UNNECESSARY */ u8 csum_cnt:3; + /* Used in foo-over-udp, set in udp[46]_gro_receive */ + u8 is_ipv6:1; + /* used to support CHECKSUM_COMPLETE for tunneling protocols */ __wsum csum; diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index dced89fbe480..7e2126a31f2e 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -65,14 +65,15 @@ static int fou_udp_recv(struct sock *sk, struct sk_buff *skb) } static struct sk_buff **fou_gro_receive(struct sk_buff **head, - struct sk_buff *skb, - const struct net_offload **offloads) + struct sk_buff *skb) { const struct net_offload *ops; struct sk_buff **pp = NULL; u8 proto = NAPI_GRO_CB(skb)->proto; + const struct net_offload **offloads; rcu_read_lock(); + offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads; ops = rcu_dereference(offloads[proto]); if (!ops || !ops->callbacks.gro_receive) goto out_unlock; @@ -85,14 +86,15 @@ out_unlock: return pp; } -static int fou_gro_complete(struct sk_buff *skb, int nhoff, - const struct net_offload **offloads) +static int fou_gro_complete(struct sk_buff *skb, int nhoff) { const struct net_offload *ops; u8 proto = NAPI_GRO_CB(skb)->proto; int err = -ENOSYS; + const struct net_offload **offloads; rcu_read_lock(); + offloads = NAPI_GRO_CB(skb)->is_ipv6 ? 
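/*
 * Editorial sketch, not part of the patch: a single bit in
 * napi_gro_cb now replaces four per-family callbacks. The UDP GRO
 * entry points stamp the outer address family,
 *
 *	udp4_gro_receive():	NAPI_GRO_CB(skb)->is_ipv6 = 0;
 *	udp6_gro_receive():	NAPI_GRO_CB(skb)->is_ipv6 = 1;
 *
 * and the shared fou handlers pick the offload table per packet:
 *
 *	offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads
 *					     : inet_offloads;
 *	ops = rcu_dereference(offloads[proto]);
 *
 * Keying off the packet instead of the socket matters because the
 * outer IP header's family, not the socket's address family, is what
 * determines the correct table for a given packet.
 */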
inet6_offloads : inet_offloads; ops = rcu_dereference(offloads[proto]); if (WARN_ON(!ops || !ops->callbacks.gro_complete)) goto out_unlock; @@ -105,28 +107,6 @@ out_unlock: return err; } -static struct sk_buff **fou4_gro_receive(struct sk_buff **head, - struct sk_buff *skb) -{ - return fou_gro_receive(head, skb, inet_offloads); -} - -static int fou4_gro_complete(struct sk_buff *skb, int nhoff) -{ - return fou_gro_complete(skb, nhoff, inet_offloads); -} - -static struct sk_buff **fou6_gro_receive(struct sk_buff **head, - struct sk_buff *skb) -{ - return fou_gro_receive(head, skb, inet6_offloads); -} - -static int fou6_gro_complete(struct sk_buff *skb, int nhoff) -{ - return fou_gro_complete(skb, nhoff, inet6_offloads); -} - static int fou_add_to_port_list(struct fou *fou) { struct fou *fout; @@ -199,20 +179,8 @@ static int fou_create(struct net *net, struct fou_cfg *cfg, sk->sk_allocation = GFP_ATOMIC; - switch (cfg->udp_config.family) { - case AF_INET: - fou->udp_offloads.callbacks.gro_receive = fou4_gro_receive; - fou->udp_offloads.callbacks.gro_complete = fou4_gro_complete; - break; - case AF_INET6: - fou->udp_offloads.callbacks.gro_receive = fou6_gro_receive; - fou->udp_offloads.callbacks.gro_complete = fou6_gro_complete; - break; - default: - err = -EPFNOSUPPORT; - goto error; - } - + fou->udp_offloads.callbacks.gro_receive = fou_gro_receive; + fou->udp_offloads.callbacks.gro_complete = fou_gro_complete; fou->udp_offloads.port = cfg->udp_config.local_udp_port; fou->udp_offloads.ipproto = cfg->protocol; diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 8c35f2c939ee..507310ef4b56 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -334,6 +334,7 @@ static struct sk_buff **udp4_gro_receive(struct sk_buff **head, skb_gro_checksum_try_convert(skb, IPPROTO_UDP, uh->check, inet_gro_compute_pseudo); skip: + NAPI_GRO_CB(skb)->is_ipv6 = 0; return udp_gro_receive(head, skb, uh); flush: diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index 8f96988c1db2..6b8f543f6ac6 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -140,6 +140,7 @@ static struct sk_buff **udp6_gro_receive(struct sk_buff **head, ip6_gro_compute_pseudo); skip: + NAPI_GRO_CB(skb)->is_ipv6 = 1; return udp_gro_receive(head, skb, uh); flush: -- cgit v1.2.3-59-g8ed1b From c8753d55afb436fd6a25c8bbe8d783f6dcf1c9f8 Mon Sep 17 00:00:00 2001 From: Vijay Subramanian Date: Thu, 2 Oct 2014 10:00:43 -0700 Subject: net: Cleanup skb cloning by adding SKB_FCLONE_FREE SKB_FCLONE_UNAVAILABLE has overloaded meaning depending on type of skb. 1: If skb is allocated from head_cache, it indicates fclone is not available. 2: If skb is a companion fclone skb (allocated from fclone_cache), it indicates it is available to be used. To avoid confusion for case 2 above, this patch replaces SKB_FCLONE_UNAVAILABLE with SKB_FCLONE_FREE where appropriate. For fclone companion skbs, this indicates it is free for use. SKB_FCLONE_UNAVAILABLE will now simply indicate skb is from head_cache and cannot / will not have a companion fclone. Signed-off-by: Vijay Subramanian Signed-off-by: David S. 
Miller --- include/linux/skbuff.h | 7 ++++--- net/core/skbuff.c | 8 ++++---- 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 7c5036d11feb..3a5ec7638627 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -339,9 +339,10 @@ struct skb_shared_info { enum { - SKB_FCLONE_UNAVAILABLE, - SKB_FCLONE_ORIG, - SKB_FCLONE_CLONE, + SKB_FCLONE_UNAVAILABLE, /* skb has no fclone (from head_cache) */ + SKB_FCLONE_ORIG, /* orig skb (from fclone_cache) */ + SKB_FCLONE_CLONE, /* companion fclone skb (from fclone_cache) */ + SKB_FCLONE_FREE, /* this companion fclone skb is available */ }; enum { diff --git a/net/core/skbuff.c b/net/core/skbuff.c index a0b312fa3047..28916e47f959 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -265,7 +265,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, skb->fclone = SKB_FCLONE_ORIG; atomic_set(&fclones->fclone_ref, 1); - fclones->skb2.fclone = SKB_FCLONE_UNAVAILABLE; + fclones->skb2.fclone = SKB_FCLONE_FREE; fclones->skb2.pfmemalloc = pfmemalloc; } out: @@ -542,7 +542,7 @@ static void kfree_skbmem(struct sk_buff *skb) fclones = container_of(skb, struct sk_buff_fclones, skb2); /* Warning : We must perform the atomic_dec_and_test() before - * setting skb->fclone back to SKB_FCLONE_UNAVAILABLE, otherwise + * setting skb->fclone back to SKB_FCLONE_FREE, otherwise * skb_clone() could set clone_ref to 2 before our decrement. * Anyway, if we are going to free the structure, no need to * rewrite skb->fclone. @@ -553,7 +553,7 @@ static void kfree_skbmem(struct sk_buff *skb) /* The clone portion is available for * fast-cloning again. */ - skb->fclone = SKB_FCLONE_UNAVAILABLE; + skb->fclone = SKB_FCLONE_FREE; } break; } @@ -874,7 +874,7 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) return NULL; if (skb->fclone == SKB_FCLONE_ORIG && - n->fclone == SKB_FCLONE_UNAVAILABLE) { + n->fclone == SKB_FCLONE_FREE) { n->fclone = SKB_FCLONE_CLONE; /* As our fastclone was free, clone_ref must be 1 at this point. * We could use atomic_inc() here, but it is faster -- cgit v1.2.3-59-g8ed1b From 7dfa4b414d4eec8da56e44fb2b4aea3e549b092f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 5 Oct 2014 12:35:09 +0300 Subject: net/mlx4_en: Code cleanups in tx path - Remove unused variable ring->poll_cnt - No need to set some fields if using blueflame - Add missing const's - Use unlikely - Remove unneeded new line - Make some comments more precise - struct mlx4_bf @offset field reduced to unsigned int to save space Signed-off-by: Eric Dumazet Signed-off-by: Amir Vadai Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlx4/en_tx.c | 49 +++++++++++++++------------- drivers/net/ethernet/mellanox/mlx4/mlx4_en.h | 1 - include/linux/mlx4/device.h | 2 +- 3 files changed, 27 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c index 0c501253fdab..eaf23eb7266a 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c @@ -191,7 +191,6 @@ int mlx4_en_activate_tx_ring(struct mlx4_en_priv *priv, ring->prod = 0; ring->cons = 0xffffffff; ring->last_nr_txbb = 1; - ring->poll_cnt = 0; memset(ring->tx_info, 0, ring->size * sizeof(struct mlx4_en_tx_info)); memset(ring->buf, 0, ring->buf_size); @@ -512,7 +511,8 @@ static struct mlx4_en_tx_desc *mlx4_en_bounce_to_desc(struct mlx4_en_priv *priv, return ring->buf + index * TXBB_SIZE; } -static int is_inline(int inline_thold, struct sk_buff *skb, void **pfrag) +static bool is_inline(int inline_thold, const struct sk_buff *skb, + void **pfrag) { void *ptr; @@ -535,7 +535,7 @@ static int is_inline(int inline_thold, struct sk_buff *skb, void **pfrag) return 0; } -static int inline_size(struct sk_buff *skb) +static int inline_size(const struct sk_buff *skb) { if (skb->len + CTRL_SIZE + sizeof(struct mlx4_wqe_inline_seg) <= MLX4_INLINE_ALIGN) @@ -546,7 +546,8 @@ static int inline_size(struct sk_buff *skb) sizeof(struct mlx4_wqe_inline_seg), 16); } -static int get_real_size(struct sk_buff *skb, struct net_device *dev, +static int get_real_size(const struct sk_buff *skb, + struct net_device *dev, int *lso_header_size) { struct mlx4_en_priv *priv = netdev_priv(dev); @@ -581,8 +582,10 @@ static int get_real_size(struct sk_buff *skb, struct net_device *dev, return real_size; } -static void build_inline_wqe(struct mlx4_en_tx_desc *tx_desc, struct sk_buff *skb, - int real_size, u16 *vlan_tag, int tx_ind, void *fragptr) +static void build_inline_wqe(struct mlx4_en_tx_desc *tx_desc, + const struct sk_buff *skb, + int real_size, u16 *vlan_tag, + int tx_ind, void *fragptr) { struct mlx4_wqe_inline_seg *inl = &tx_desc->inl; int spc = MLX4_INLINE_ALIGN - CTRL_SIZE - sizeof *inl; @@ -642,7 +645,8 @@ u16 mlx4_en_select_queue(struct net_device *dev, struct sk_buff *skb, return fallback(dev, skb) % rings_p_up + up * rings_p_up; } -static void mlx4_bf_copy(void __iomem *dst, unsigned long *src, unsigned bytecnt) +static void mlx4_bf_copy(void __iomem *dst, const void *src, + unsigned int bytecnt) { __iowrite64_copy(dst, src, bytecnt / 8); } @@ -736,11 +740,10 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev) tx_info->skb = skb; tx_info->nr_txbb = nr_txbb; + data = &tx_desc->data; if (lso_header_size) data = ((void *)&tx_desc->lso + ALIGN(lso_header_size + 4, DS_SIZE)); - else - data = &tx_desc->data; /* valid only for none inline segments */ tx_info->data_offset = (void *)data - (void *)tx_desc; @@ -753,9 +756,9 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev) if (is_inline(ring->inline_thold, skb, &fragptr)) { tx_info->inl = 1; } else { - /* Map fragments */ + /* Map fragments if any */ for (i = skb_shinfo(skb)->nr_frags - 1; i >= 0; i--) { - struct skb_frag_struct *frag; + const struct skb_frag_struct *frag; dma_addr_t dma; frag = &skb_shinfo(skb)->frags[i]; @@ -772,7 +775,7 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev) --data; } - /* Map linear part */ + /* Map linear part if needed */ if (tx_info->linear) { u32 byte_count = 
skb_headlen(skb) - lso_header_size; dma_addr_t dma; @@ -795,18 +798,14 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev) * For timestamping add flag to skb_shinfo and * set flag for further reference */ - if (ring->hwtstamp_tx_type == HWTSTAMP_TX_ON && - skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP) { - skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS; + if (unlikely(ring->hwtstamp_tx_type == HWTSTAMP_TX_ON && + shinfo->tx_flags & SKBTX_HW_TSTAMP)) { + shinfo->tx_flags |= SKBTX_IN_PROGRESS; tx_info->ts_requested = 1; } /* Prepare ctrl segement apart opcode+ownership, which depends on * whether LSO is used */ - tx_desc->ctrl.vlan_tag = cpu_to_be16(vlan_tag); - tx_desc->ctrl.ins_vlan = MLX4_WQE_CTRL_INS_VLAN * - !!vlan_tx_tag_present(skb); - tx_desc->ctrl.fence_size = (real_size / 16) & 0x3f; tx_desc->ctrl.srcrb_flags = priv->ctrl_flags; if (likely(skb->ip_summed == CHECKSUM_PARTIAL)) { tx_desc->ctrl.srcrb_flags |= cpu_to_be32(MLX4_WQE_CTRL_IP_CSUM | @@ -852,7 +851,6 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev) cpu_to_be32(MLX4_EN_BIT_DESC_OWN) : 0); tx_info->nr_bytes = max_t(unsigned int, skb->len, ETH_ZLEN); ring->packets++; - } ring->bytes += tx_info->nr_bytes; netdev_tx_sent_queue(ring->tx_queue, tx_info->nr_bytes); @@ -874,7 +872,7 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev) ring->prod += nr_txbb; /* If we used a bounce buffer then copy descriptor back into place */ - if (bounce) + if (unlikely(bounce)) tx_desc = mlx4_en_bounce_to_desc(priv, ring, index, desc_size); skb_tx_timestamp(skb); @@ -894,13 +892,18 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev) wmb(); - mlx4_bf_copy(ring->bf.reg + ring->bf.offset, (unsigned long *) &tx_desc->ctrl, - desc_size); + mlx4_bf_copy(ring->bf.reg + ring->bf.offset, &tx_desc->ctrl, + desc_size); wmb(); ring->bf.offset ^= ring->bf.buf_size; } else { + tx_desc->ctrl.vlan_tag = cpu_to_be16(vlan_tag); + tx_desc->ctrl.ins_vlan = MLX4_WQE_CTRL_INS_VLAN * + !!vlan_tx_tag_present(skb); + tx_desc->ctrl.fence_size = real_size; + /* Ensure new descriptor hits memory * before setting ownership of this descriptor to HW */ diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h index 84c9d5dbfa4f..e54b653de3d4 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h @@ -263,7 +263,6 @@ struct mlx4_en_tx_ring { u32 buf_size; u32 doorbell_qpn; void *buf; - u16 poll_cnt; struct mlx4_en_tx_info *tx_info; u8 *bounce_buf; u8 queue_index; diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index b2f8ab9a57c4..37e4404d0227 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -583,7 +583,7 @@ struct mlx4_uar { }; struct mlx4_bf { - unsigned long offset; + unsigned int offset; int buf_size; struct mlx4_uar *uar; void __iomem *reg; -- cgit v1.2.3-59-g8ed1b From fcbeb976d7ce783fd58e63e61c196d9a8912b3be Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 5 Oct 2014 10:11:27 -0700 Subject: net: introduce netdevice gso_min_segs attribute Some TSO engines might have too heavy a setup cost, which impacts performance on hosts sending small bursts (2 MSS per packet). This patch adds a device gso_min_segs, allowing drivers to set a minimum number of segments for TSO packets, according to the NIC performance. Tested on a mlx4 NIC, this yields a ~110% throughput increase when sending 2 MSS per packet.
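To make the new attribute concrete: a driver whose TSO engine only pays off above a certain burst size can set gso_min_segs at probe time, and netif_skb_features() will then strip the GSO features from smaller skbs so they are segmented in software. A minimal sketch follows; the foo_* name and the threshold of 8 segments are illustrative assumptions, not part of this patch:

	/* Hypothetical probe-time setup: skbs carrying fewer than 8 GSO
	 * segments lose NETIF_F_GSO_MASK in netif_skb_features() and are
	 * segmented in software instead of using the NIC TSO engine.
	 */
	static void foo_configure_tso(struct net_device *netdev)
	{
		netdev->gso_max_segs = 32;	/* example hardware descriptor limit */
		netdev->gso_min_segs = 8;	/* example: TSO setup too costly below this */
	}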
Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 +++- net/core/dev.c | 9 ++++++--- 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 22d54b9b700d..2df86f50261c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1416,6 +1416,8 @@ enum netdev_priv_flags { * @gso_max_size: Maximum size of generic segmentation offload * @gso_max_segs: Maximum number of segments that can be passed to the * NIC for GSO + * @gso_min_segs: Minimum number of segments that can be passed to the + * NIC for GSO * * @dcbnl_ops: Data Center Bridging netlink ops * @num_tc: Number of traffic classes in the net device @@ -1666,7 +1668,7 @@ struct net_device { unsigned int gso_max_size; #define GSO_MAX_SEGS 65535 u16 gso_max_segs; - + u16 gso_min_segs; #ifdef CONFIG_DCB const struct dcbnl_rtnl_ops *dcbnl_ops; #endif diff --git a/net/core/dev.c b/net/core/dev.c index 7d5691cc1f47..c1a4de275576 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2567,10 +2567,12 @@ static netdev_features_t harmonize_features(struct sk_buff *skb, netdev_features_t netif_skb_features(struct sk_buff *skb) { + const struct net_device *dev = skb->dev; + netdev_features_t features = dev->features; + u16 gso_segs = skb_shinfo(skb)->gso_segs; __be16 protocol = skb->protocol; - netdev_features_t features = skb->dev->features; - if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs) + if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs) features &= ~NETIF_F_GSO_MASK; if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) { @@ -2581,7 +2583,7 @@ netdev_features_t netif_skb_features(struct sk_buff *skb) } features = netdev_intersect_features(features, - skb->dev->vlan_features | + dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX); @@ -6661,6 +6663,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev->gso_max_size = GSO_MAX_SIZE; dev->gso_max_segs = GSO_MAX_SEGS; + dev->gso_min_segs = 0; INIT_LIST_HEAD(&dev->napi_list); INIT_LIST_HEAD(&dev->unreg_list); -- cgit v1.2.3-59-g8ed1b From fd2ef0ba3071c92ac6272ab22ea3f2b16d88a4eb Mon Sep 17 00:00:00 2001 From: Petri Gynther Date: Mon, 6 Oct 2014 11:38:30 -0700 Subject: net: phy: adjust fixed_phy_register() return value Adjust fixed_phy_register() to return struct phy_device *, so that it becomes easy to use fixed PHYs without device tree support: phydev = fixed_phy_register(PHY_POLL, &fixed_phy_status, NULL); fixed_phy_set_link_update(phydev, fixed_phy_link_update); phy_connect_direct(netdev, phydev, handler_fn, phy_interface); This change is a prerequisite for modifying bcmgenet driver to work without a device tree on Broadcom's MIPS-based 7xxx platforms. Signed-off-by: Petri Gynther Signed-off-by: David S. 
Miller --- drivers/net/phy/fixed.c | 16 ++++++++-------- drivers/of/of_mdio.c | 7 +++++-- include/linux/phy_fixed.h | 14 +++++++------- 3 files changed, 20 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/phy/fixed.c b/drivers/net/phy/fixed.c index 5b19fbbda6d4..47872caa0081 100644 --- a/drivers/net/phy/fixed.c +++ b/drivers/net/phy/fixed.c @@ -233,9 +233,9 @@ EXPORT_SYMBOL_GPL(fixed_phy_del); static int phy_fixed_addr; static DEFINE_SPINLOCK(phy_fixed_addr_lock); -int fixed_phy_register(unsigned int irq, - struct fixed_phy_status *status, - struct device_node *np) +struct phy_device *fixed_phy_register(unsigned int irq, + struct fixed_phy_status *status, + struct device_node *np) { struct fixed_mdio_bus *fmb = &platform_fmb; struct phy_device *phy; @@ -246,19 +246,19 @@ int fixed_phy_register(unsigned int irq, spin_lock(&phy_fixed_addr_lock); if (phy_fixed_addr == PHY_MAX_ADDR) { spin_unlock(&phy_fixed_addr_lock); - return -ENOSPC; + return ERR_PTR(-ENOSPC); } phy_addr = phy_fixed_addr++; spin_unlock(&phy_fixed_addr_lock); ret = fixed_phy_add(PHY_POLL, phy_addr, status); if (ret < 0) - return ret; + return ERR_PTR(ret); phy = get_phy_device(fmb->mii_bus, phy_addr, false); if (!phy || IS_ERR(phy)) { fixed_phy_del(phy_addr); - return -EINVAL; + return ERR_PTR(-EINVAL); } of_node_get(np); @@ -269,10 +269,10 @@ int fixed_phy_register(unsigned int irq, phy_device_free(phy); of_node_put(np); fixed_phy_del(phy_addr); - return ret; + return ERR_PTR(ret); } - return 0; + return phy; } static int __init fixed_mdio_bus_init(void) diff --git a/drivers/of/of_mdio.c b/drivers/of/of_mdio.c index a85d80012993..1bd43053b8c7 100644 --- a/drivers/of/of_mdio.c +++ b/drivers/of/of_mdio.c @@ -286,6 +286,7 @@ int of_phy_register_fixed_link(struct device_node *np) struct device_node *fixed_link_node; const __be32 *fixed_link_prop; int len; + struct phy_device *phy; /* New binding */ fixed_link_node = of_get_child_by_name(np, "fixed-link"); @@ -299,7 +300,8 @@ int of_phy_register_fixed_link(struct device_node *np) status.asym_pause = of_property_read_bool(fixed_link_node, "asym-pause"); of_node_put(fixed_link_node); - return fixed_phy_register(PHY_POLL, &status, np); + phy = fixed_phy_register(PHY_POLL, &status, np); + return IS_ERR(phy) ? PTR_ERR(phy) : 0; } /* Old binding */ @@ -310,7 +312,8 @@ int of_phy_register_fixed_link(struct device_node *np) status.speed = be32_to_cpu(fixed_link_prop[2]); status.pause = be32_to_cpu(fixed_link_prop[3]); status.asym_pause = be32_to_cpu(fixed_link_prop[4]); - return fixed_phy_register(PHY_POLL, &status, np); + phy = fixed_phy_register(PHY_POLL, &status, np); + return IS_ERR(phy) ? 
PTR_ERR(phy) : 0; } return -ENODEV; diff --git a/include/linux/phy_fixed.h b/include/linux/phy_fixed.h index 941138664c1d..f2ca1b459377 100644 --- a/include/linux/phy_fixed.h +++ b/include/linux/phy_fixed.h @@ -14,9 +14,9 @@ struct device_node; #ifdef CONFIG_FIXED_PHY extern int fixed_phy_add(unsigned int irq, int phy_id, struct fixed_phy_status *status); -extern int fixed_phy_register(unsigned int irq, - struct fixed_phy_status *status, - struct device_node *np); +extern struct phy_device *fixed_phy_register(unsigned int irq, + struct fixed_phy_status *status, + struct device_node *np); extern void fixed_phy_del(int phy_addr); extern int fixed_phy_set_link_update(struct phy_device *phydev, int (*link_update)(struct net_device *, @@ -27,11 +27,11 @@ static inline int fixed_phy_add(unsigned int irq, int phy_id, { return -ENODEV; } -static inline int fixed_phy_register(unsigned int irq, - struct fixed_phy_status *status, - struct device_node *np) +static inline struct phy_device *fixed_phy_register(unsigned int irq, + struct fixed_phy_status *status, + struct device_node *np) { - return -ENODEV; + return ERR_PTR(-ENODEV); } static inline int fixed_phy_del(int phy_addr) { -- cgit v1.2.3-59-g8ed1b From 0287587884b15041203b3a362d485e1ab1f24445 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 5 Oct 2014 18:38:35 -0700 Subject: net: better IFF_XMIT_DST_RELEASE support Testing xmit_more support with netperf and connected UDP sockets, I found strange dst refcount false sharing. Current handling of IFF_XMIT_DST_RELEASE is not optimal. Dropping the dst in validate_xmit_skb() is certainly too late in case the packet was queued by cpu X but dequeued by cpu Y. The logical point to take care of drop/force is in __dev_queue_xmit() before even taking the qdisc lock. As Julian Anastasov pointed out, the need for skb_dst() might come from some packet schedulers or classifiers. This patch adds a new helper to cleanly express the needs of various drivers or qdiscs/classifiers. Drivers that need skb_dst() in their ndo_start_xmit() should call the following helper in their setup instead of the prior: dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; -> netif_keep_dst(dev); Instead of using a single bit, we use two bits, one being eventually rebuilt in bonding/team drivers. The other one is permanent and blocks IFF_XMIT_DST_RELEASE from being rebuilt in bonding/team. Eventually, we could add something smarter later. Signed-off-by: Eric Dumazet Cc: Julian Anastasov Signed-off-by: David S.
Miller --- drivers/infiniband/ulp/ipoib/ipoib_main.c | 2 +- drivers/net/appletalk/ipddp.c | 2 +- drivers/net/bonding/bond_main.c | 9 ++++++--- drivers/net/eql.c | 2 +- drivers/net/ifb.c | 3 ++- drivers/net/loopback.c | 2 +- drivers/net/macvlan.c | 3 ++- drivers/net/ppp/ppp_generic.c | 2 +- drivers/net/team/team.c | 8 +++++--- drivers/net/vxlan.c | 2 +- drivers/net/wan/hdlc_fr.c | 2 +- drivers/s390/net/qeth_l3_main.c | 2 +- include/linux/netdevice.h | 8 ++++++++ net/8021q/vlan_dev.c | 3 ++- net/atm/clip.c | 2 +- net/core/dev.c | 19 +++++++++---------- net/ipv4/ip_gre.c | 2 +- net/ipv4/ip_vti.c | 2 +- net/ipv4/ipip.c | 2 +- net/ipv6/ip6_gre.c | 2 +- net/ipv6/ip6_tunnel.c | 2 +- net/ipv6/ip6_vti.c | 2 +- net/ipv6/sit.c | 2 +- net/sched/cls_flow.c | 2 ++ net/sched/cls_route.c | 1 + net/sched/sch_generic.c | 3 --- net/sched/sch_teql.c | 2 +- 27 files changed, 54 insertions(+), 39 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 13e6e0431592..58b5aa3b6f2d 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -1364,7 +1364,7 @@ void ipoib_setup(struct net_device *dev) dev->tx_queue_len = ipoib_sendq_size * 2; dev->features = (NETIF_F_VLAN_CHALLENGED | NETIF_F_HIGHDMA); - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); memcpy(dev->broadcast, ipv4_bcast_addr, INFINIBAND_ALEN); diff --git a/drivers/net/appletalk/ipddp.c b/drivers/net/appletalk/ipddp.c index 10d0dba572c2..e90c6a7333d7 100644 --- a/drivers/net/appletalk/ipddp.c +++ b/drivers/net/appletalk/ipddp.c @@ -74,7 +74,7 @@ static struct net_device * __init ipddp_init(void) if (!dev) return ERR_PTR(-ENOMEM); - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); strcpy(dev->name, "ipddp%d"); if (version_printed++ == 0) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 3ad5413d4f57..c9ac06cfe6b7 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1002,7 +1002,8 @@ static netdev_features_t bond_fix_features(struct net_device *dev, static void bond_compute_features(struct bonding *bond) { - unsigned int flags, dst_release_flag = IFF_XMIT_DST_RELEASE; + unsigned int dst_release_flag = IFF_XMIT_DST_RELEASE | + IFF_XMIT_DST_RELEASE_PERM; netdev_features_t vlan_features = BOND_VLAN_FEATURES; netdev_features_t enc_features = BOND_ENC_FEATURES; struct net_device *bond_dev = bond->dev; @@ -1038,8 +1039,10 @@ done: bond_dev->gso_max_segs = gso_max_segs; netif_set_gso_max_size(bond_dev, gso_max_size); - flags = bond_dev->priv_flags & ~IFF_XMIT_DST_RELEASE; - bond_dev->priv_flags = flags | dst_release_flag; + bond_dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + if ((bond_dev->priv_flags & IFF_XMIT_DST_RELEASE_PERM) && + dst_release_flag == (IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM)) + bond_dev->priv_flags |= IFF_XMIT_DST_RELEASE; netdev_change_features(bond_dev); } diff --git a/drivers/net/eql.c b/drivers/net/eql.c index 957e5c0cede3..a10ad74cc8d2 100644 --- a/drivers/net/eql.c +++ b/drivers/net/eql.c @@ -199,7 +199,7 @@ static void __init eql_setup(struct net_device *dev) dev->type = ARPHRD_SLIP; dev->tx_queue_len = 5; /* Hands them off fast */ - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); } static int eql_open(struct net_device *dev) diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c index d2d4a3d2237f..34f846b4bd05 100644 --- a/drivers/net/ifb.c +++ b/drivers/net/ifb.c @@ -185,7 +185,8 
@@ static void ifb_setup(struct net_device *dev) dev->flags |= IFF_NOARP; dev->flags &= ~IFF_MULTICAST; - dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_TX_SKB_SHARING); + dev->priv_flags &= ~IFF_TX_SKB_SHARING; + netif_keep_dst(dev); eth_hw_addr_random(dev); } diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c index 8f2262540561..c76283c2f84a 100644 --- a/drivers/net/loopback.c +++ b/drivers/net/loopback.c @@ -169,7 +169,7 @@ static void loopback_setup(struct net_device *dev) dev->type = ARPHRD_LOOPBACK; /* 0x0001*/ dev->flags = IFF_LOOPBACK; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); dev->hw_features = NETIF_F_ALL_TSO | NETIF_F_UFO; dev->features = NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_ALL_TSO diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index e8a453f1b458..38b4fae61f04 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -1025,7 +1025,8 @@ void macvlan_common_setup(struct net_device *dev) { ether_setup(dev); - dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_TX_SKB_SHARING); + dev->priv_flags &= ~IFF_TX_SKB_SHARING; + netif_keep_dst(dev); dev->priv_flags |= IFF_UNICAST_FLT; dev->netdev_ops = &macvlan_netdev_ops; dev->destructor = free_netdev; diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c index fa0d71727894..80e6f3430f65 100644 --- a/drivers/net/ppp/ppp_generic.c +++ b/drivers/net/ppp/ppp_generic.c @@ -1103,7 +1103,7 @@ static void ppp_setup(struct net_device *dev) dev->type = ARPHRD_PPP; dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST; dev->features |= NETIF_F_NETNS_LOCAL; - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); } /* diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c index 2277c3679a51..a94a9df3e6bd 100644 --- a/drivers/net/team/team.c +++ b/drivers/net/team/team.c @@ -970,7 +970,8 @@ static void __team_compute_features(struct team *team) struct team_port *port; u32 vlan_features = TEAM_VLAN_FEATURES & NETIF_F_ALL_FOR_ALL; unsigned short max_hard_header_len = ETH_HLEN; - unsigned int flags, dst_release_flag = IFF_XMIT_DST_RELEASE; + unsigned int dst_release_flag = IFF_XMIT_DST_RELEASE | + IFF_XMIT_DST_RELEASE_PERM; list_for_each_entry(port, &team->port_list, list) { vlan_features = netdev_increment_features(vlan_features, @@ -985,8 +986,9 @@ static void __team_compute_features(struct team *team) team->dev->vlan_features = vlan_features; team->dev->hard_header_len = max_hard_header_len; - flags = team->dev->priv_flags & ~IFF_XMIT_DST_RELEASE; - team->dev->priv_flags = flags | dst_release_flag; + team->dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + if (dst_release_flag == (IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM)) + team->dev->priv_flags |= IFF_XMIT_DST_RELEASE; netdev_change_features(team->dev); } diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 2af795d6ba05..2a51e6e48e1e 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -2193,7 +2193,7 @@ static void vxlan_setup(struct net_device *dev) dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_RXCSUM; dev->hw_features |= NETIF_F_GSO_SOFTWARE; dev->hw_features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; INIT_LIST_HEAD(&vxlan->next); diff --git a/drivers/net/wan/hdlc_fr.c b/drivers/net/wan/hdlc_fr.c index e5c7e6165a4b..3ebed1c40abb 100644 --- a/drivers/net/wan/hdlc_fr.c +++ b/drivers/net/wan/hdlc_fr.c @@ -1047,7 
+1047,7 @@ static void pvc_setup(struct net_device *dev) dev->flags = IFF_POINTOPOINT; dev->hard_header_len = 10; dev->addr_len = 2; - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); } static const struct net_device_ops pvc_ops = { diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c index f8427a2c4840..afebb9709763 100644 --- a/drivers/s390/net/qeth_l3_main.c +++ b/drivers/s390/net/qeth_l3_main.c @@ -3306,7 +3306,7 @@ static int qeth_l3_setup_netdev(struct qeth_card *card) card->dev->features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX | NETIF_F_HW_VLAN_CTAG_FILTER; - card->dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(card->dev); card->dev->gso_max_size = 15 * PAGE_SIZE; SET_NETDEV_DEV(card->dev, &card->gdev->dev); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2df86f50261c..3a4315b39d20 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1206,6 +1206,7 @@ enum netdev_priv_flags { IFF_SUPP_NOFCS = 1<<19, IFF_LIVE_ADDR_CHANGE = 1<<20, IFF_MACVLAN = 1<<21, + IFF_XMIT_DST_RELEASE_PERM = 1<<22, }; #define IFF_802_1Q_VLAN IFF_802_1Q_VLAN @@ -1230,6 +1231,7 @@ enum netdev_priv_flags { #define IFF_SUPP_NOFCS IFF_SUPP_NOFCS #define IFF_LIVE_ADDR_CHANGE IFF_LIVE_ADDR_CHANGE #define IFF_MACVLAN IFF_MACVLAN +#define IFF_XMIT_DST_RELEASE_PERM IFF_XMIT_DST_RELEASE_PERM /** * struct net_device - The DEVICE structure. @@ -3588,6 +3590,12 @@ static inline bool netif_supports_nofcs(struct net_device *dev) return dev->priv_flags & IFF_SUPP_NOFCS; } +/* This device needs to keep skb dst for qdisc enqueue or ndo_start_xmit() */ +static inline void netif_keep_dst(struct net_device *dev) +{ + dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM); +} + extern struct pernet_operations __net_initdata loopback_net_ops; /* Logging, debugging and troubleshooting/diagnostic helpers. */ diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 35a6b6b15e8a..0d441ec8763e 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -799,7 +799,8 @@ void vlan_setup(struct net_device *dev) ether_setup(dev); dev->priv_flags |= IFF_802_1Q_VLAN; - dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_TX_SKB_SHARING); + dev->priv_flags &= ~IFF_TX_SKB_SHARING; + netif_keep_dst(dev); dev->tx_queue_len = 0; dev->netdev_ops = &vlan_netdev_ops; diff --git a/net/atm/clip.c b/net/atm/clip.c index 1d9eaa4f041a..17e55dfecbe2 100644 --- a/net/atm/clip.c +++ b/net/atm/clip.c @@ -501,7 +501,7 @@ static void clip_setup(struct net_device *dev) /* without any more elaborate queuing. 100 is a reasonable */ /* compromise between decent burst-tolerance and protection */ /* against memory hogs. */ - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); } static int clip_create(int number) diff --git a/net/core/dev.c b/net/core/dev.c index a63b8c43c1b6..3c5bdaa44486 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2665,12 +2665,6 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device if (skb->next) return skb; - /* If device doesn't need skb->dst, release it right now while - * its hot in this cpu cache - */ - if (dev->priv_flags & IFF_XMIT_DST_RELEASE) - skb_dst_drop(skb); - features = netif_skb_features(skb); skb = validate_xmit_vlan(skb, features); if (unlikely(!skb)) @@ -2811,8 +2805,6 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, * waiting to be sent out; and the qdisc is not running - * xmit the skb directly. 
*/ - if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE)) - skb_dst_force(skb); qdisc_bstats_update(q, skb); @@ -2827,7 +2819,6 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, rc = NET_XMIT_SUCCESS; } else { - skb_dst_force(skb); rc = q->enqueue(skb, q) & NET_XMIT_MASK; if (qdisc_run_begin(q)) { if (unlikely(contended)) { @@ -2924,6 +2915,14 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) skb_update_prio(skb); + /* If device/qdisc don't need skb->dst, release it right now while + * its hot in this cpu cache. + */ + if (dev->priv_flags & IFF_XMIT_DST_RELEASE) + skb_dst_drop(skb); + else + skb_dst_force(skb); + txq = netdev_pick_tx(dev, skb, accel_priv); q = rcu_dereference_bh(txq->qdisc); @@ -6674,7 +6673,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, INIT_LIST_HEAD(&dev->adj_list.lower); INIT_LIST_HEAD(&dev->all_adj_list.upper); INIT_LIST_HEAD(&dev->all_adj_list.lower); - dev->priv_flags = IFF_XMIT_DST_RELEASE; + dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; setup(dev); dev->num_tx_queues = txqs; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 0485ef18d254..12055fdbe716 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -510,7 +510,7 @@ static int ipgre_tunnel_init(struct net_device *dev) memcpy(dev->broadcast, &iph->daddr, 4); dev->flags = IFF_NOARP; - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); dev->addr_len = 4; if (iph->daddr) { diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index e453cb724a95..3e861011e4a3 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -364,7 +364,7 @@ static int vti_tunnel_init(struct net_device *dev) dev->iflink = 0; dev->addr_len = 4; dev->features |= NETIF_F_LLTX; - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); return ip_tunnel_init(dev); } diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index ea88ab3102a8..37096d64730e 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -289,7 +289,7 @@ static void ipip_tunnel_setup(struct net_device *dev) dev->iflink = 0; dev->addr_len = 4; dev->features |= NETIF_F_LLTX; - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); dev->features |= IPIP_FEATURES; dev->hw_features |= IPIP_FEATURES; diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 74b677916a70..de3b1c86b8d3 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -1242,7 +1242,7 @@ static void ip6gre_tunnel_setup(struct net_device *dev) dev->flags |= IFF_NOARP; dev->iflink = 0; dev->addr_len = sizeof(struct in6_addr); - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); } static int ip6gre_tunnel_init(struct net_device *dev) diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index d3e8888ad611..9409887fb664 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1493,7 +1493,7 @@ static void ip6_tnl_dev_setup(struct net_device *dev) dev->mtu -= 8; dev->flags |= IFF_NOARP; dev->addr_len = sizeof(struct in6_addr); - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); /* This perm addr will be used as interface identifier by IPv6 */ dev->addr_assign_type = NET_ADDR_RANDOM; eth_random_addr(dev->perm_addr); diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 5833a2244467..d440bb585524 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -807,7 +807,7 @@ static void vti6_dev_setup(struct net_device *dev) dev->mtu = ETH_DATA_LEN; dev->flags |= IFF_NOARP; dev->addr_len = sizeof(struct in6_addr); - dev->priv_flags &= 
~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); } /** diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 0d4e27466f82..6eab37cf5345 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -1364,7 +1364,7 @@ static void ipip6_tunnel_setup(struct net_device *dev) dev->hard_header_len = LL_MAX_HEADER + t_hlen; dev->mtu = ETH_DATA_LEN - t_hlen; dev->flags = IFF_NOARP; - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); dev->iflink = 0; dev->addr_len = 4; dev->features |= NETIF_F_LLTX; diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index a5d2b20db560..4ac515f2a6ce 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -493,6 +493,8 @@ static int flow_change(struct net *net, struct sk_buff *in_skb, tcf_exts_change(tp, &fnew->exts, &e); tcf_em_tree_change(tp, &fnew->ematches, &t); + netif_keep_dst(qdisc_dev(tp->q)); + if (tb[TCA_FLOW_KEYS]) { fnew->keymask = keymask; fnew->nkeys = nkeys; diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index 6f22baae0afa..109a329b7198 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -524,6 +524,7 @@ static int route4_change(struct net *net, struct sk_buff *in_skb, if (f->handle < f1->handle) break; + netif_keep_dst(qdisc_dev(tp->q)); rcu_assign_pointer(f->next, f1); rcu_assign_pointer(*fp, f); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 2b349a4de3c8..38d58e6cef07 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -47,7 +47,6 @@ EXPORT_SYMBOL(default_qdisc_ops); static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q) { - skb_dst_force(skb); q->gso_skb = skb; q->qstats.requeues++; q->q.qlen++; /* it's still part of the queue */ @@ -218,8 +217,6 @@ static inline int qdisc_restart(struct Qdisc *q) if (unlikely(!skb)) return 0; - WARN_ON_ONCE(skb_dst_is_noref(skb)); - root_lock = qdisc_lock(q); dev = qdisc_dev(q); txq = skb_get_tx_queue(dev, skb); diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c index 5cd291bd00e4..6ada42396a24 100644 --- a/net/sched/sch_teql.c +++ b/net/sched/sch_teql.c @@ -470,7 +470,7 @@ static __init void teql_master_setup(struct net_device *dev) dev->tx_queue_len = 100; dev->flags = IFF_NOARP; dev->hard_header_len = LL_MAX_HEADER; - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); } static LIST_HEAD(master_dev_list); -- cgit v1.2.3-59-g8ed1b From 583d4a6885cfa75a3d189f6bb69b5c545e961c75 Mon Sep 17 00:00:00 2001 From: LEROY Christophe Date: Tue, 7 Oct 2014 15:04:57 +0200 Subject: net: fs_enet: Remove non NAPI RX In the probe function, use_napi is unconditionally set to 1. This patch removes all the code which is conditional on !use_napi, and removes use_napi, which has thus become useless. Signed-off-by: Christophe Leroy Signed-off-by: David S.
Miller --- .../net/ethernet/freescale/fs_enet/fs_enet-main.c | 164 ++------------------- include/linux/fs_enet_pd.h | 1 - 2 files changed, 15 insertions(+), 150 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c b/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c index 748fd24d3d9e..71a25b4e2d5a 100644 --- a/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c +++ b/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c @@ -215,128 +215,6 @@ static int fs_enet_rx_napi(struct napi_struct *napi, int budget) return received; } -/* non NAPI receive function */ -static int fs_enet_rx_non_napi(struct net_device *dev) -{ - struct fs_enet_private *fep = netdev_priv(dev); - const struct fs_platform_info *fpi = fep->fpi; - cbd_t __iomem *bdp; - struct sk_buff *skb, *skbn, *skbt; - int received = 0; - u16 pkt_len, sc; - int curidx; - /* - * First, grab all of the stats for the incoming packet. - * These get messed up if we get called due to a busy condition. - */ - bdp = fep->cur_rx; - - while (((sc = CBDR_SC(bdp)) & BD_ENET_RX_EMPTY) == 0) { - - curidx = bdp - fep->rx_bd_base; - - /* - * Since we have allocated space to hold a complete frame, - * the last indicator should be set. - */ - if ((sc & BD_ENET_RX_LAST) == 0) - dev_warn(fep->dev, "rcv is not +last\n"); - - /* - * Check for errors. - */ - if (sc & (BD_ENET_RX_LG | BD_ENET_RX_SH | BD_ENET_RX_CL | - BD_ENET_RX_NO | BD_ENET_RX_CR | BD_ENET_RX_OV)) { - fep->stats.rx_errors++; - /* Frame too long or too short. */ - if (sc & (BD_ENET_RX_LG | BD_ENET_RX_SH)) - fep->stats.rx_length_errors++; - /* Frame alignment */ - if (sc & (BD_ENET_RX_NO | BD_ENET_RX_CL)) - fep->stats.rx_frame_errors++; - /* CRC Error */ - if (sc & BD_ENET_RX_CR) - fep->stats.rx_crc_errors++; - /* FIFO overrun */ - if (sc & BD_ENET_RX_OV) - fep->stats.rx_crc_errors++; - - skb = fep->rx_skbuff[curidx]; - - dma_unmap_single(fep->dev, CBDR_BUFADDR(bdp), - L1_CACHE_ALIGN(PKT_MAXBUF_SIZE), - DMA_FROM_DEVICE); - - skbn = skb; - - } else { - - skb = fep->rx_skbuff[curidx]; - - dma_unmap_single(fep->dev, CBDR_BUFADDR(bdp), - L1_CACHE_ALIGN(PKT_MAXBUF_SIZE), - DMA_FROM_DEVICE); - - /* - * Process the incoming frame. - */ - fep->stats.rx_packets++; - pkt_len = CBDR_DATLEN(bdp) - 4; /* remove CRC */ - fep->stats.rx_bytes += pkt_len + 4; - - if (pkt_len <= fpi->rx_copybreak) { - /* +2 to make IP header L1 cache aligned */ - skbn = netdev_alloc_skb(dev, pkt_len + 2); - if (skbn != NULL) { - skb_reserve(skbn, 2); /* align IP header */ - skb_copy_from_linear_data(skb, - skbn->data, pkt_len); - /* swap */ - skbt = skb; - skb = skbn; - skbn = skbt; - } - } else { - skbn = netdev_alloc_skb(dev, ENET_RX_FRSIZE); - - if (skbn) - skb_align(skbn, ENET_RX_ALIGN); - } - - if (skbn != NULL) { - skb_put(skb, pkt_len); /* Make room */ - skb->protocol = eth_type_trans(skb, dev); - received++; - netif_rx(skb); - } else { - fep->stats.rx_dropped++; - skbn = skb; - } - } - - fep->rx_skbuff[curidx] = skbn; - CBDW_BUFADDR(bdp, dma_map_single(fep->dev, skbn->data, - L1_CACHE_ALIGN(PKT_MAXBUF_SIZE), - DMA_FROM_DEVICE)); - CBDW_DATLEN(bdp, 0); - CBDW_SC(bdp, (sc & ~BD_ENET_RX_STATS) | BD_ENET_RX_EMPTY); - - /* - * Update BD pointer to next entry. 
- */ - if ((sc & BD_ENET_RX_WRAP) == 0) - bdp++; - else - bdp = fep->rx_bd_base; - - (*fep->ops->rx_bd_done)(dev); - } - - fep->cur_rx = bdp; - - return 0; -} - static void fs_enet_tx(struct net_device *dev) { struct fs_enet_private *fep = netdev_priv(dev); @@ -453,8 +331,7 @@ fs_enet_interrupt(int irq, void *dev_id) nr++; int_clr_events = int_events; - if (fpi->use_napi) - int_clr_events &= ~fep->ev_napi_rx; + int_clr_events &= ~fep->ev_napi_rx; (*fep->ops->clear_int_events)(dev, int_clr_events); @@ -462,19 +339,15 @@ fs_enet_interrupt(int irq, void *dev_id) (*fep->ops->ev_error)(dev, int_events); if (int_events & fep->ev_rx) { - if (!fpi->use_napi) - fs_enet_rx_non_napi(dev); - else { - napi_ok = napi_schedule_prep(&fep->napi); - - (*fep->ops->napi_disable_rx)(dev); - (*fep->ops->clear_int_events)(dev, fep->ev_napi_rx); - - /* NOTE: it is possible for FCCs in NAPI mode */ - /* to submit a spurious interrupt while in poll */ - if (napi_ok) - __napi_schedule(&fep->napi); - } + napi_ok = napi_schedule_prep(&fep->napi); + + (*fep->ops->napi_disable_rx)(dev); + (*fep->ops->clear_int_events)(dev, fep->ev_napi_rx); + + /* NOTE: it is possible for FCCs in NAPI mode */ + /* to submit a spurious interrupt while in poll */ + if (napi_ok) + __napi_schedule(&fep->napi); } if (int_events & fep->ev_tx) @@ -811,24 +684,21 @@ static int fs_enet_open(struct net_device *dev) /* not doing this, will cause a crash in fs_enet_rx_napi */ fs_init_bds(fep->ndev); - if (fep->fpi->use_napi) - napi_enable(&fep->napi); + napi_enable(&fep->napi); /* Install our interrupt handler. */ r = request_irq(fep->interrupt, fs_enet_interrupt, IRQF_SHARED, "fs_enet-mac", dev); if (r != 0) { dev_err(fep->dev, "Could not allocate FS_ENET IRQ!"); - if (fep->fpi->use_napi) - napi_disable(&fep->napi); + napi_disable(&fep->napi); return -EINVAL; } err = fs_init_phy(dev); if (err) { free_irq(fep->interrupt, dev); - if (fep->fpi->use_napi) - napi_disable(&fep->napi); + napi_disable(&fep->napi); return err; } phy_start(fep->phydev); @@ -845,8 +715,7 @@ static int fs_enet_close(struct net_device *dev) netif_stop_queue(dev); netif_carrier_off(dev); - if (fep->fpi->use_napi) - napi_disable(&fep->napi); + napi_disable(&fep->napi); phy_stop(fep->phydev); spin_lock_irqsave(&fep->lock, flags); @@ -1022,7 +891,6 @@ static int fs_enet_probe(struct platform_device *ofdev) fpi->rx_ring = 32; fpi->tx_ring = 32; fpi->rx_copybreak = 240; - fpi->use_napi = 1; fpi->napi_weight = 17; fpi->phy_node = of_parse_phandle(ofdev->dev.of_node, "phy-handle", 0); if (!fpi->phy_node && of_phy_is_fixed_link(ofdev->dev.of_node)) { @@ -1102,9 +970,7 @@ static int fs_enet_probe(struct platform_device *ofdev) ndev->netdev_ops = &fs_enet_netdev_ops; ndev->watchdog_timeo = 2 * HZ; - if (fpi->use_napi) - netif_napi_add(ndev, &fep->napi, fs_enet_rx_napi, - fpi->napi_weight); + netif_napi_add(ndev, &fep->napi, fs_enet_rx_napi, fpi->napi_weight); ndev->ethtool_ops = &fs_ethtool_ops; diff --git a/include/linux/fs_enet_pd.h b/include/linux/fs_enet_pd.h index efb05961bdd8..77d783f71527 100644 --- a/include/linux/fs_enet_pd.h +++ b/include/linux/fs_enet_pd.h @@ -139,7 +139,6 @@ struct fs_platform_info { int rx_ring, tx_ring; /* number of buffers on rx */ __u8 macaddr[ETH_ALEN]; /* mac address */ int rx_copybreak; /* limit we copy small frames */ - int use_napi; /* use NAPI */ int napi_weight; /* NAPI weight */ int use_rmii; /* use RMII mode */ -- cgit v1.2.3-59-g8ed1b From 709c48b39ecf11a81f3820c13a828c330fd832b9 Mon Sep 17 00:00:00 2001 From: Masanari Iida Date: Wed, 8 Oct 2014 
23:53:39 +0900 Subject: net: description of dma_cookie causes make xmldocs warning In commit 7bced397510ab569d31de4c70b39e13355046387, dma_cookie was removed from struct sk_buff. But the description of dma_cookie still exists. So "make xmldocs" outputs the following warning. Warning(.//include/linux/skbuff.h:609): Excess struct/union /enum/typedef member 'dma_cookie' description in 'sk_buff' Removing the description of dma_cookie fixes the symptom. Signed-off-by: Masanari Iida Acked-by: Dan Williams Signed-off-by: David S. Miller --- include/linux/skbuff.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 3a5ec7638627..776104ba02ce 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -483,8 +483,6 @@ static inline u32 skb_mstamp_us_delta(const struct skb_mstamp *t1, * @wifi_acked_valid: wifi_acked was set * @wifi_acked: whether frame was acked on wifi or not * @no_fcs: Request NIC to treat last 4 bytes as Ethernet FCS - * @dma_cookie: a cookie to one of several possible DMA operations - * done by skb DMA functions * @napi_id: id of the NAPI struct this skb came from * @secmark: security marking * @mark: Generic packet mark -- cgit v1.2.3-59-g8ed1b From 535114539bb2c081b6680cb5a34be17e7b45df37 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 8 Oct 2014 08:19:27 -0700 Subject: net: add netdev_txq_bql_{enqueue, complete}_prefetchw() helpers Add two helpers so that drivers do not have to care whether BQL is available or not. Signed-off-by: Eric Dumazet Reported-by: Jim Davis Fixes: 29d40c903247 ("net/mlx4_en: Use prefetch in tx path") Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/en_tx.c | 5 +++-- include/linux/netdevice.h | 29 +++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c index 8726a4aee5a7..34c137878545 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c @@ -392,7 +392,8 @@ static bool mlx4_en_process_tx_cq(struct net_device *dev, if (!priv->port_up) return true; - prefetchw(&ring->tx_queue->dql.limit); + netdev_txq_bql_complete_prefetchw(ring->tx_queue); + index = cons_index & size_mask; cqe = mlx4_en_get_cqe(buf, index, priv->cqe_size) + factor; last_nr_txbb = ACCESS_ONCE(ring->last_nr_txbb); @@ -737,7 +738,7 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev) vlan_tag = vlan_tx_tag_get(skb); - prefetchw(&ring->tx_queue->dql); + netdev_txq_bql_enqueue_prefetchw(ring->tx_queue); /* Track current inflight packets for performance analysis */ AVG_PERF_COUNTER(priv->pstats.inflight_avg, diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3a4315b39d20..838407aea705 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -30,6 +30,7 @@ #include #include #include +#include <linux/prefetch.h> #include #include @@ -2480,6 +2481,34 @@ netif_xmit_frozen_or_drv_stopped(const struct netdev_queue *dev_queue) return dev_queue->state & QUEUE_STATE_DRV_XOFF_OR_FROZEN; } +/** + * netdev_txq_bql_enqueue_prefetchw - prefetch bql data for write + * @dev_queue: pointer to transmit queue + * + * BQL enabled drivers might use this helper in their ndo_start_xmit(), + * to give appropriate hint to the cpu.
+ */ +static inline void netdev_txq_bql_enqueue_prefetchw(struct netdev_queue *dev_queue) +{ +#ifdef CONFIG_BQL + prefetchw(&dev_queue->dql.num_queued); +#endif +} + +/** + * netdev_txq_bql_complete_prefetchw - prefetch bql data for write + * @dev_queue: pointer to transmit queue + * + * BQL enabled drivers might use this helper in their TX completion path, + * to give appropriate hint to the cpu. + */ +static inline void netdev_txq_bql_complete_prefetchw(struct netdev_queue *dev_queue) +{ +#ifdef CONFIG_BQL + prefetchw(&dev_queue->dql.limit); +#endif +} + static inline void netdev_tx_sent_queue(struct netdev_queue *dev_queue, unsigned int bytes) { -- cgit v1.2.3-59-g8ed1b
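To show how the two helpers pair up in practice, here is a minimal sketch of a BQL-enabled driver using them around its BQL accounting; the foo_* functions and the single-queue assumption are illustrative, while the netdev_txq_bql_*_prefetchw(), netdev_get_tx_queue() and netdev_tx_*_queue() calls are the real API shown in the hunks above:

	/* Sketch only: warm the relevant dql cachelines for write before
	 * the BQL accounting calls, as the mlx4 hunks above do.
	 */
	static netdev_tx_t foo_start_xmit(struct sk_buff *skb, struct net_device *dev)
	{
		struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);

		netdev_txq_bql_enqueue_prefetchw(txq);	/* prefetches dql.num_queued */
		/* ... build and post the hardware TX descriptor ... */
		netdev_tx_sent_queue(txq, skb->len);
		return NETDEV_TX_OK;
	}

	static void foo_tx_complete(struct net_device *dev, unsigned int pkts,
				    unsigned int bytes)
	{
		struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);

		netdev_txq_bql_complete_prefetchw(txq);	/* prefetches dql.limit */
		/* ... reclaim completed descriptors ... */
		netdev_tx_completed_queue(txq, pkts, bytes);
	}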