diff options
436 files changed, 12117 insertions, 3967 deletions
@@ -435,7 +435,7 @@ Martin Kepplinger <martink@posteo.de> <martin.kepplinger@ginzinger.com> Martin Kepplinger <martink@posteo.de> <martin.kepplinger@puri.sm> Martin Kepplinger <martink@posteo.de> <martin.kepplinger@theobroma-systems.com> Martyna Szapar-Mudlaw <martyna.szapar-mudlaw@linux.intel.com> <martyna.szapar-mudlaw@intel.com> -Mathieu Othacehe <m.othacehe@gmail.com> <othacehe@gnu.org> +Mathieu Othacehe <othacehe@gnu.org> <m.othacehe@gmail.com> Mat Martineau <martineau@kernel.org> <mathew.j.martineau@linux.intel.com> Mat Martineau <martineau@kernel.org> <mathewm@codeaurora.org> Matthew Wilcox <willy@infradead.org> <matthew.r.wilcox@intel.com> @@ -529,6 +529,8 @@ Oleksij Rempel <linux@rempel-privat.de> <external.Oleksij.Rempel@de.bosch.com> Oleksij Rempel <linux@rempel-privat.de> <fixed-term.Oleksij.Rempel@de.bosch.com> Oleksij Rempel <o.rempel@pengutronix.de> Oleksij Rempel <o.rempel@pengutronix.de> <ore@pengutronix.de> +Oliver Hartkopp <socketcan@hartkopp.net> <oliver.hartkopp@volkswagen.de> +Oliver Hartkopp <socketcan@hartkopp.net> <oliver@hartkopp.net> Oliver Upton <oliver.upton@linux.dev> <oupton@google.com> Ondřej Jirman <megi@xff.cz> <megous@megous.com> Oza Pawandeep <quic_poza@quicinc.com> <poza@codeaurora.org> @@ -20,6 +20,10 @@ N: Thomas Abraham E: thomas.ab@samsung.com D: Samsung pin controller driver +N: Jose Abreu +E: jose.abreu@synopsys.com +D: Synopsys DesignWare XPCS MDIO/PCS driver. + N: Dragos Acostachioaie E: dragos@iname.com W: http://www.arbornet.org/~dragos @@ -1428,6 +1432,10 @@ S: 8124 Constitution Apt. 7 S: Sterling Heights, Michigan 48313 S: USA +N: Andy Gospodarek +E: andy@greyhouse.net +D: Maintenance and contributions to the network interface bonding driver. 
+ N: Wolfgang Grandegger E: wg@grandegger.com D: Controller Area Network (device drivers) @@ -1812,6 +1820,10 @@ D: Author/maintainer of most DRM drivers (especially ATI, MGA) D: Core DRM templates, general DRM and 3D-related hacking S: No fixed address +N: Woojung Huh +E: woojung.huh@microchip.com +D: Microchip LAN78XX USB Ethernet driver + N: Kenn Humborg E: kenn@wombat.ie D: Mods to loop device to support sparse backing files diff --git a/Documentation/Makefile b/Documentation/Makefile index fa71602ec961..52c6c5a3efa9 100644 --- a/Documentation/Makefile +++ b/Documentation/Makefile @@ -104,7 +104,7 @@ quiet_cmd_sphinx = SPHINX $@ --> file://$(abspath $(BUILDDIR)/$3/$4) YNL_INDEX:=$(srctree)/Documentation/networking/netlink_spec/index.rst YNL_RST_DIR:=$(srctree)/Documentation/networking/netlink_spec YNL_YAML_DIR:=$(srctree)/Documentation/netlink/specs -YNL_TOOL:=$(srctree)/tools/net/ynl/ynl-gen-rst.py +YNL_TOOL:=$(srctree)/tools/net/ynl/pyynl/ynl_gen_rst.py YNL_RST_FILES_TMP := $(patsubst %.yaml,%.rst,$(wildcard $(YNL_YAML_DIR)/*.yaml)) YNL_RST_FILES := $(patsubst $(YNL_YAML_DIR)%,$(YNL_RST_DIR)%, $(YNL_RST_FILES_TMP)) diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst index 5034915f4e8e..8872203df088 100644 --- a/Documentation/admin-guide/mm/transhuge.rst +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -436,7 +436,7 @@ AnonHugePmdMapped). The number of file transparent huge pages mapped to userspace is available by reading ShmemPmdMapped and ShmemHugePages fields in ``/proc/meminfo``. To identify what applications are mapping file transparent huge pages, it -is necessary to read ``/proc/PID/smaps`` and count the FileHugeMapped fields +is necessary to read ``/proc/PID/smaps`` and count the FilePmdMapped fields for each mapping. 
Note that reading the smaps file is expensive and reading it diff --git a/Documentation/devicetree/bindings/net/amlogic,meson-dwmac.yaml b/Documentation/devicetree/bindings/net/amlogic,meson-dwmac.yaml index d1e2bca3c503..798a4c19f18c 100644 --- a/Documentation/devicetree/bindings/net/amlogic,meson-dwmac.yaml +++ b/Documentation/devicetree/bindings/net/amlogic,meson-dwmac.yaml @@ -166,11 +166,11 @@ unevaluatedProperties: false examples: - | ethmac: ethernet@c9410000 { - compatible = "amlogic,meson-gxbb-dwmac", "snps,dwmac"; - reg = <0xc9410000 0x10000>, <0xc8834540 0x8>; - interrupts = <8>; - interrupt-names = "macirq"; - clocks = <&clk_eth>, <&clk_fclk_div2>, <&clk_mpll2>, <&clk_fclk_div2>; - clock-names = "stmmaceth", "clkin0", "clkin1", "timing-adjustment"; - phy-mode = "rgmii"; + compatible = "amlogic,meson-gxbb-dwmac", "snps,dwmac"; + reg = <0xc9410000 0x10000>, <0xc8834540 0x8>; + interrupts = <8>; + interrupt-names = "macirq"; + clocks = <&clk_eth>, <&clk_fclk_div2>, <&clk_mpll2>, <&clk_fclk_div2>; + clock-names = "stmmaceth", "clkin0", "clkin1", "timing-adjustment"; + phy-mode = "rgmii"; }; diff --git a/Documentation/devicetree/bindings/net/asix,ax88178.yaml b/Documentation/devicetree/bindings/net/asix,ax88178.yaml index 768504ccbf74..03341b7438d5 100644 --- a/Documentation/devicetree/bindings/net/asix,ax88178.yaml +++ b/Documentation/devicetree/bindings/net/asix,ax88178.yaml @@ -63,8 +63,8 @@ examples: #size-cells = <0>; ethernet@1 { - compatible = "usbb95,772b"; - reg = <1>; + compatible = "usbb95,772b"; + reg = <1>; }; }; }; diff --git a/Documentation/devicetree/bindings/net/brcm,bcmgenet.yaml b/Documentation/devicetree/bindings/net/brcm,bcmgenet.yaml index 7c90a4390531..0e3fb4e42e3f 100644 --- a/Documentation/devicetree/bindings/net/brcm,bcmgenet.yaml +++ b/Documentation/devicetree/bindings/net/brcm,bcmgenet.yaml @@ -85,16 +85,16 @@ examples: #size-cells = <1>; mdio0: mdio@e14 { - compatible = "brcm,genet-mdio-v4"; - #address-cells = <1>; - #size-cells 
= <0>; - reg = <0xe14 0x8>; + compatible = "brcm,genet-mdio-v4"; + #address-cells = <1>; + #size-cells = <0>; + reg = <0xe14 0x8>; - phy1: ethernet-phy@1 { + phy1: ethernet-phy@1 { max-speed = <1000>; reg = <1>; compatible = "ethernet-phy-ieee802.3-c22"; - }; + }; }; }; @@ -110,10 +110,10 @@ examples: interrupts = <0x0 0x16 0x0>, <0x0 0x17 0x0>; mdio1: mdio@e14 { - compatible = "brcm,genet-mdio-v4"; - #address-cells = <1>; - #size-cells = <0>; - reg = <0xe14 0x8>; + compatible = "brcm,genet-mdio-v4"; + #address-cells = <1>; + #size-cells = <0>; + reg = <0xe14 0x8>; }; }; @@ -129,15 +129,15 @@ examples: interrupts = <0x0 0x18 0x0>, <0x0 0x19 0x0>; mdio2: mdio@e14 { - compatible = "brcm,genet-mdio-v4"; - #address-cells = <1>; - #size-cells = <0>; - reg = <0xe14 0x8>; + compatible = "brcm,genet-mdio-v4"; + #address-cells = <1>; + #size-cells = <0>; + reg = <0xe14 0x8>; - phy0: ethernet-phy@0 { + phy0: ethernet-phy@0 { max-speed = <1000>; reg = <0>; compatible = "ethernet-phy-ieee802.3-c22"; - }; + }; }; }; diff --git a/Documentation/devicetree/bindings/net/brcm,mdio-mux-iproc.yaml b/Documentation/devicetree/bindings/net/brcm,mdio-mux-iproc.yaml index af96b4fd89d5..3f27746d9a56 100644 --- a/Documentation/devicetree/bindings/net/brcm,mdio-mux-iproc.yaml +++ b/Documentation/devicetree/bindings/net/brcm,mdio-mux-iproc.yaml @@ -38,43 +38,43 @@ unevaluatedProperties: false examples: - | - mdio_mux_iproc: mdio-mux@66020000 { + mdio-mux@66020000 { compatible = "brcm,mdio-mux-iproc"; reg = <0x66020000 0x250>; #address-cells = <1>; #size-cells = <0>; mdio@0 { - reg = <0x0>; - #address-cells = <1>; - #size-cells = <0>; - - pci_phy0: pci-phy@0 { - compatible = "brcm,ns2-pcie-phy"; - reg = <0x0>; - #phy-cells = <0>; - }; + reg = <0x0>; + #address-cells = <1>; + #size-cells = <0>; + + pci-phy@0 { + compatible = "brcm,ns2-pcie-phy"; + reg = <0x0>; + #phy-cells = <0>; + }; }; mdio@7 { - reg = <0x7>; - #address-cells = <1>; - #size-cells = <0>; - - pci_phy1: pci-phy@0 { - compatible = 
"brcm,ns2-pcie-phy"; - reg = <0x0>; - #phy-cells = <0>; - }; + reg = <0x7>; + #address-cells = <1>; + #size-cells = <0>; + + pci-phy@0 { + compatible = "brcm,ns2-pcie-phy"; + reg = <0x0>; + #phy-cells = <0>; + }; }; mdio@10 { - reg = <0x10>; - #address-cells = <1>; - #size-cells = <0>; + reg = <0x10>; + #address-cells = <1>; + #size-cells = <0>; - gphy0: eth-phy@10 { - reg = <0x10>; - }; + eth-phy@10 { + reg = <0x10>; + }; }; }; diff --git a/Documentation/devicetree/bindings/net/can/atmel,at91sam9263-can.yaml b/Documentation/devicetree/bindings/net/can/atmel,at91sam9263-can.yaml new file mode 100644 index 000000000000..c818c01a718b --- /dev/null +++ b/Documentation/devicetree/bindings/net/can/atmel,at91sam9263-can.yaml @@ -0,0 +1,58 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/net/can/atmel,at91sam9263-can.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Microchip AT91 CAN Controller + +maintainers: + - Nicolas Ferre <nicolas.ferre@microchip.com> + +allOf: + - $ref: can-controller.yaml# + +properties: + compatible: + oneOf: + - enum: + - atmel,at91sam9263-can + - atmel,at91sam9x5-can + - items: + - enum: + - microchip,sam9x60-can + - const: atmel,at91sam9x5-can + + reg: + maxItems: 1 + + interrupts: + maxItems: 1 + + clocks: + maxItems: 1 + + clock-names: + items: + - const: can_clk + +required: + - compatible + - reg + - interrupts + - clocks + - clock-names + +unevaluatedProperties: false + +examples: + - | + #include <dt-bindings/interrupt-controller/irq.h> + #include <dt-bindings/clock/at91.h> + can@f000c000 { + compatible = "atmel,at91sam9263-can"; + reg = <0xf000c000 0x300>; + interrupts = <30 IRQ_TYPE_LEVEL_HIGH 3>; + clocks = <&pmc PMC_TYPE_PERIPHERAL 12>; + clock-names = "can_clk"; + }; diff --git a/Documentation/devicetree/bindings/net/can/atmel-can.txt b/Documentation/devicetree/bindings/net/can/atmel-can.txt deleted file mode 100644 index 
218a3b3eb27e..000000000000 --- a/Documentation/devicetree/bindings/net/can/atmel-can.txt +++ /dev/null @@ -1,15 +0,0 @@ -* AT91 CAN * - -Required properties: - - compatible: Should be "atmel,at91sam9263-can", "atmel,at91sam9x5-can" or - "microchip,sam9x60-can" - - reg: Should contain CAN controller registers location and length - - interrupts: Should contain IRQ line for the CAN controller - -Example: - - can0: can@f000c000 { - compatible = "atmel,at91sam9x5-can"; - reg = <0xf000c000 0x300>; - interrupts = <40 4 5> - }; diff --git a/Documentation/devicetree/bindings/net/can/bosch,c_can.yaml b/Documentation/devicetree/bindings/net/can/bosch,c_can.yaml index 4d7d67ee175a..ff1b59a0294e 100644 --- a/Documentation/devicetree/bindings/net/can/bosch,c_can.yaml +++ b/Documentation/devicetree/bindings/net/can/bosch,c_can.yaml @@ -99,11 +99,11 @@ examples: #include <dt-bindings/reset/altr,rst-mgr.h> can@ffc00000 { - compatible = "bosch,d_can"; - reg = <0xffc00000 0x1000>; - interrupts = <0 131 4>, <0 132 4>, <0 133 4>, <0 134 4>; - clocks = <&can0_clk>; - resets = <&rst CAN0_RESET>; + compatible = "bosch,d_can"; + reg = <0xffc00000 0x1000>; + interrupts = <0 131 4>, <0 132 4>, <0 133 4>, <0 134 4>; + clocks = <&can0_clk>; + resets = <&rst CAN0_RESET>; }; - | can@0 { diff --git a/Documentation/devicetree/bindings/net/can/microchip,mcp2510.yaml b/Documentation/devicetree/bindings/net/can/microchip,mcp2510.yaml index db446dde6842..e0ec53bc10c6 100644 --- a/Documentation/devicetree/bindings/net/can/microchip,mcp2510.yaml +++ b/Documentation/devicetree/bindings/net/can/microchip,mcp2510.yaml @@ -56,15 +56,15 @@ examples: #size-cells = <0>; can@1 { - compatible = "microchip,mcp2515"; - reg = <1>; - clocks = <&clk24m>; - interrupt-parent = <&gpio4>; - interrupts = <13 IRQ_TYPE_LEVEL_LOW>; - vdd-supply = <&reg5v0>; - xceiver-supply = <&reg5v0>; - gpio-controller; - #gpio-cells = <2>; + compatible = "microchip,mcp2515"; + reg = <1>; + clocks = <&clk24m>; + interrupt-parent = <&gpio4>; + 
interrupts = <13 IRQ_TYPE_LEVEL_LOW>; + vdd-supply = <&reg5v0>; + xceiver-supply = <&reg5v0>; + gpio-controller; + #gpio-cells = <2>; }; }; diff --git a/Documentation/devicetree/bindings/net/can/microchip,mpfs-can.yaml b/Documentation/devicetree/bindings/net/can/microchip,mpfs-can.yaml index 01e4d4a54df6..1219c5cb601f 100644 --- a/Documentation/devicetree/bindings/net/can/microchip,mpfs-can.yaml +++ b/Documentation/devicetree/bindings/net/can/microchip,mpfs-can.yaml @@ -15,7 +15,11 @@ allOf: properties: compatible: - const: microchip,mpfs-can + oneOf: + - items: + - const: microchip,pic64gx-can + - const: microchip,mpfs-can + - const: microchip,mpfs-can reg: maxItems: 1 diff --git a/Documentation/devicetree/bindings/net/can/st,stm32-bxcan.yaml b/Documentation/devicetree/bindings/net/can/st,stm32-bxcan.yaml index de1d4298893b..c7510b00954a 100644 --- a/Documentation/devicetree/bindings/net/can/st,stm32-bxcan.yaml +++ b/Documentation/devicetree/bindings/net/can/st,stm32-bxcan.yaml @@ -63,7 +63,7 @@ properties: maxItems: 1 st,gcan: - $ref: /schemas/types.yaml#/definitions/phandle-array + $ref: /schemas/types.yaml#/definitions/phandle description: The phandle to the gcan node which allows to access the 512-bytes SRAM memory shared by the two bxCAN cells (CAN1 primary and CAN2 diff --git a/Documentation/devicetree/bindings/net/can/tcan4x5x.txt b/Documentation/devicetree/bindings/net/can/tcan4x5x.txt deleted file mode 100644 index 20c0572c9853..000000000000 --- a/Documentation/devicetree/bindings/net/can/tcan4x5x.txt +++ /dev/null @@ -1,48 +0,0 @@ -Texas Instruments TCAN4x5x CAN Controller -================================================ - -This file provides device node information for the TCAN4x5x interface contains. 
- -Required properties: - - compatible: - "ti,tcan4552", "ti,tcan4x5x" - "ti,tcan4553", "ti,tcan4x5x" or - "ti,tcan4x5x" - - reg: 0 - - #address-cells: 1 - - #size-cells: 0 - - spi-max-frequency: Maximum frequency of the SPI bus the chip can - operate at should be less than or equal to 18 MHz. - - interrupt-parent: the phandle to the interrupt controller which provides - the interrupt. - - interrupts: interrupt specification for data-ready. - -See Documentation/devicetree/bindings/net/can/bosch,m_can.yaml for additional -required property details. - -Optional properties: - - reset-gpios: Hardwired output GPIO. If not defined then software - reset. - - device-state-gpios: Input GPIO that indicates if the device is in - a sleep state or if the device is active. Not - available with tcan4552/4553. - - device-wake-gpios: Wake up GPIO to wake up the TCAN device. Not - available with tcan4552/4553. - - wakeup-source: Leave the chip running when suspended, and configure - the RX interrupt to wake up the device. 
- -Example: -tcan4x5x: tcan4x5x@0 { - compatible = "ti,tcan4x5x"; - reg = <0>; - #address-cells = <1>; - #size-cells = <1>; - spi-max-frequency = <10000000>; - bosch,mram-cfg = <0x0 0 0 16 0 0 1 1>; - interrupt-parent = <&gpio1>; - interrupts = <14 IRQ_TYPE_LEVEL_LOW>; - device-state-gpios = <&gpio3 21 GPIO_ACTIVE_HIGH>; - device-wake-gpios = <&gpio1 15 GPIO_ACTIVE_HIGH>; - reset-gpios = <&gpio1 27 GPIO_ACTIVE_HIGH>; - wakeup-source; -}; diff --git a/Documentation/devicetree/bindings/net/can/ti,tcan4x5x.yaml b/Documentation/devicetree/bindings/net/can/ti,tcan4x5x.yaml new file mode 100644 index 000000000000..384e15da2713 --- /dev/null +++ b/Documentation/devicetree/bindings/net/can/ti,tcan4x5x.yaml @@ -0,0 +1,199 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/net/can/ti,tcan4x5x.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Texas Instruments TCAN4x5x CAN Controller + +maintainers: + - Marc Kleine-Budde <mkl@pengutronix.de> + +properties: + compatible: + oneOf: + - items: + - enum: + - ti,tcan4552 + - ti,tcan4553 + - const: ti,tcan4x5x + - const: ti,tcan4x5x + + reg: + maxItems: 1 + + interrupts: + maxItems: 1 + description: The GPIO parent interrupt. + + clocks: + maxItems: 1 + + clock-names: + items: + - const: cclk + + reset-gpios: + description: Hardwired output GPIO. If not defined then software reset. + maxItems: 1 + + device-state-gpios: + description: + Input GPIO that indicates if the device is in a sleep state or if the + device is active. Not available with tcan4552/4553. + maxItems: 1 + + device-wake-gpios: + description: + Wake up GPIO to wake up the TCAN device. + Not available with tcan4552/4553. + maxItems: 1 + + bosch,mram-cfg: + description: | + Message RAM configuration data. 
+ Multiple M_CAN instances can share the same Message RAM + and each element(e.g Rx FIFO or Tx Buffer and etc) number + in Message RAM is also configurable, so this property is + telling driver how the shared or private Message RAM are + used by this M_CAN controller. + + The format should be as follows: + <offset sidf_elems xidf_elems rxf0_elems rxf1_elems rxb_elems txe_elems txb_elems> + The 'offset' is an address offset of the Message RAM where + the following elements start from. This is usually set to + 0x0 if you're using a private Message RAM. The remain cells + are used to specify how many elements are used for each FIFO/Buffer. + + M_CAN includes the following elements according to user manual: + 11-bit Filter 0-128 elements / 0-128 words + 29-bit Filter 0-64 elements / 0-128 words + Rx FIFO 0 0-64 elements / 0-1152 words + Rx FIFO 1 0-64 elements / 0-1152 words + Rx Buffers 0-64 elements / 0-1152 words + Tx Event FIFO 0-32 elements / 0-64 words + Tx Buffers 0-32 elements / 0-576 words + + Please refer to 2.4.1 Message RAM Configuration in Bosch + M_CAN user manual for details. + $ref: /schemas/types.yaml#/definitions/int32-array + items: + - description: The 'offset' is an address offset of the Message RAM where + the following elements start from. This is usually set to 0x0 if + you're using a private Message RAM. 
+ default: 0 + - description: 11-bit Filter 0-128 elements / 0-128 words + minimum: 0 + maximum: 128 + - description: 29-bit Filter 0-64 elements / 0-128 words + minimum: 0 + maximum: 64 + - description: Rx FIFO 0 0-64 elements / 0-1152 words + minimum: 0 + maximum: 64 + - description: Rx FIFO 1 0-64 elements / 0-1152 words + minimum: 0 + maximum: 64 + - description: Rx Buffers 0-64 elements / 0-1152 words + minimum: 0 + maximum: 64 + - description: Tx Event FIFO 0-32 elements / 0-64 words + minimum: 0 + maximum: 32 + - description: Tx Buffers 0-32 elements / 0-576 words + minimum: 0 + maximum: 32 + minItems: 1 + + spi-max-frequency: + description: + Must be half or less of "clocks" frequency. + maximum: 18000000 + + ti,nwkrq-voltage-vio: + type: boolean + description: + nWKRQ Pin GPO buffer voltage configuration. + Set nWKRQ to use VIO voltage rail. + When not set nWKRQ will use internal voltage rail. + + wakeup-source: + $ref: /schemas/types.yaml#/definitions/flag + description: + Enable CAN remote wakeup. 
+ +allOf: + - $ref: can-controller.yaml# + - $ref: /schemas/spi/spi-peripheral-props.yaml# + - if: + properties: + compatible: + contains: + enum: + - ti,tcan4552 + - ti,tcan4553 + then: + properties: + device-state-gpios: false + device-wake-gpios: false + +required: + - compatible + - reg + - interrupts + - clocks + - clock-names + - bosch,mram-cfg + +unevaluatedProperties: false + +examples: + - | + #include <dt-bindings/gpio/gpio.h> + #include <dt-bindings/interrupt-controller/irq.h> + + spi { + #address-cells = <1>; + #size-cells = <0>; + + can@0 { + compatible = "ti,tcan4x5x"; + reg = <0>; + clocks = <&can0_osc>; + clock-names = "cclk"; + pinctrl-names = "default"; + pinctrl-0 = <&can0_pins>; + spi-max-frequency = <10000000>; + bosch,mram-cfg = <0x0 0 0 16 0 0 1 1>; + interrupt-parent = <&gpio1>; + interrupts = <14 IRQ_TYPE_LEVEL_LOW>; + device-state-gpios = <&gpio3 21 GPIO_ACTIVE_HIGH>; + device-wake-gpios = <&gpio1 15 GPIO_ACTIVE_HIGH>; + reset-gpios = <&gpio1 27 GPIO_ACTIVE_HIGH>; + ti,nwkrq-voltage-vio; + wakeup-source; + }; + }; + - | + #include <dt-bindings/gpio/gpio.h> + #include <dt-bindings/interrupt-controller/irq.h> + + spi { + #address-cells = <1>; + #size-cells = <0>; + + can@0 { + compatible = "ti,tcan4552", "ti,tcan4x5x"; + reg = <0>; + clocks = <&can0_osc>; + clock-names = "cclk"; + pinctrl-names = "default"; + pinctrl-0 = <&can0_pins>; + spi-max-frequency = <10000000>; + bosch,mram-cfg = <0x0 0 0 16 0 0 1 1>; + interrupt-parent = <&gpio1>; + interrupts = <14 IRQ_TYPE_LEVEL_LOW>; + reset-gpios = <&gpio1 27 GPIO_ACTIVE_HIGH>; + wakeup-source; + }; + }; diff --git a/Documentation/devicetree/bindings/net/pse-pd/pse-controller.yaml b/Documentation/devicetree/bindings/net/pse-pd/pse-controller.yaml index a12cda8aa764..cd09560e0aea 100644 --- a/Documentation/devicetree/bindings/net/pse-pd/pse-controller.yaml +++ b/Documentation/devicetree/bindings/net/pse-pd/pse-controller.yaml @@ -81,7 +81,7 @@ properties: List of phandles, each pointing to the 
power supply for the corresponding pairset named in 'pairset-names'. This property aligns with IEEE 802.3-2022, Section 33.2.3 and 145.2.4. - PSE Pinout Alternatives (as per IEEE 802.3-2022 Table 145\u20133) + PSE Pinout Alternatives (as per IEEE 802.3-2022 Table 145-3) |-----------|---------------|---------------|---------------|---------------| | Conductor | Alternative A | Alternative A | Alternative B | Alternative B | | | (MDI-X) | (MDI) | (X) | (S) | diff --git a/Documentation/devicetree/bindings/net/qcom,ipa.yaml b/Documentation/devicetree/bindings/net/qcom,ipa.yaml index 53cae71d9957..1a46d80a66e8 100644 --- a/Documentation/devicetree/bindings/net/qcom,ipa.yaml +++ b/Documentation/devicetree/bindings/net/qcom,ipa.yaml @@ -239,7 +239,7 @@ examples: qcom,gsi-loader = "self"; memory-region = <&ipa_fw_mem>; - firmware-name = "qcom/sc7180-trogdor/modem/modem.mdt"; + firmware-name = "qcom/sc7180-trogdor/modem/modem.mbn"; iommus = <&apps_smmu 0x440 0x0>, <&apps_smmu 0x442 0x0>; diff --git a/Documentation/devicetree/bindings/net/stm32-dwmac.yaml b/Documentation/devicetree/bindings/net/stm32-dwmac.yaml index bf23838fe6e8..85cea9966a27 100644 --- a/Documentation/devicetree/bindings/net/stm32-dwmac.yaml +++ b/Documentation/devicetree/bindings/net/stm32-dwmac.yaml @@ -154,56 +154,56 @@ examples: #include <dt-bindings/interrupt-controller/arm-gic.h> #include <dt-bindings/clock/stm32mp1-clks.h> //Example 1 - ethernet0: ethernet@5800a000 { - compatible = "st,stm32mp1-dwmac", "snps,dwmac-4.20a"; - reg = <0x5800a000 0x2000>; - reg-names = "stmmaceth"; - interrupts = <GIC_SPI 61 IRQ_TYPE_LEVEL_HIGH>; - interrupt-names = "macirq"; - clock-names = "stmmaceth", - "mac-clk-tx", - "mac-clk-rx", - "ethstp", - "eth-ck"; - clocks = <&rcc ETHMAC>, - <&rcc ETHTX>, - <&rcc ETHRX>, - <&rcc ETHSTP>, - <&rcc ETHCK_K>; - st,syscon = <&syscfg 0x4>; - snps,pbl = <2>; - snps,axi-config = <&stmmac_axi_config_0>; - snps,tso; - phy-mode = "rgmii"; - }; + ethernet0: ethernet@5800a000 { + 
compatible = "st,stm32mp1-dwmac", "snps,dwmac-4.20a"; + reg = <0x5800a000 0x2000>; + reg-names = "stmmaceth"; + interrupts = <GIC_SPI 61 IRQ_TYPE_LEVEL_HIGH>; + interrupt-names = "macirq"; + clock-names = "stmmaceth", + "mac-clk-tx", + "mac-clk-rx", + "ethstp", + "eth-ck"; + clocks = <&rcc ETHMAC>, + <&rcc ETHTX>, + <&rcc ETHRX>, + <&rcc ETHSTP>, + <&rcc ETHCK_K>; + st,syscon = <&syscfg 0x4>; + snps,pbl = <2>; + snps,axi-config = <&stmmac_axi_config_0>; + snps,tso; + phy-mode = "rgmii"; + }; - | //Example 2 (MCU example) - ethernet1: ethernet@40028000 { - compatible = "st,stm32-dwmac", "snps,dwmac-3.50a"; - reg = <0x40028000 0x8000>; - reg-names = "stmmaceth"; - interrupts = <0 61 0>, <0 62 0>; - interrupt-names = "macirq", "eth_wake_irq"; - clock-names = "stmmaceth", "mac-clk-tx", "mac-clk-rx"; - clocks = <&rcc 0 25>, <&rcc 0 26>, <&rcc 0 27>; - st,syscon = <&syscfg 0x4>; - snps,pbl = <8>; - snps,mixed-burst; - phy-mode = "mii"; - }; + ethernet1: ethernet@40028000 { + compatible = "st,stm32-dwmac", "snps,dwmac-3.50a"; + reg = <0x40028000 0x8000>; + reg-names = "stmmaceth"; + interrupts = <0 61 0>, <0 62 0>; + interrupt-names = "macirq", "eth_wake_irq"; + clock-names = "stmmaceth", "mac-clk-tx", "mac-clk-rx"; + clocks = <&rcc 0 25>, <&rcc 0 26>, <&rcc 0 27>; + st,syscon = <&syscfg 0x4>; + snps,pbl = <8>; + snps,mixed-burst; + phy-mode = "mii"; + }; - | //Example 3 - ethernet2: ethernet@40027000 { - compatible = "st,stm32-dwmac", "snps,dwmac-4.10a"; - reg = <0x40028000 0x8000>; - reg-names = "stmmaceth"; - interrupts = <61>; - interrupt-names = "macirq"; - clock-names = "stmmaceth", "mac-clk-tx", "mac-clk-rx"; - clocks = <&rcc 62>, <&rcc 61>, <&rcc 60>; - st,syscon = <&syscfg 0x4>; - snps,pbl = <8>; - phy-mode = "mii"; - }; + ethernet2: ethernet@40027000 { + compatible = "st,stm32-dwmac", "snps,dwmac-4.10a"; + reg = <0x40028000 0x8000>; + reg-names = "stmmaceth"; + interrupts = <61>; + interrupt-names = "macirq"; + clock-names = "stmmaceth", "mac-clk-tx", 
"mac-clk-rx"; + clocks = <&rcc 62>, <&rcc 61>, <&rcc 60>; + st,syscon = <&syscfg 0x4>; + snps,pbl = <8>; + phy-mode = "mii"; + }; diff --git a/Documentation/devicetree/bindings/net/ti,davinci-mdio.yaml b/Documentation/devicetree/bindings/net/ti,davinci-mdio.yaml index 53604fab0b73..08119b6880ee 100644 --- a/Documentation/devicetree/bindings/net/ti,davinci-mdio.yaml +++ b/Documentation/devicetree/bindings/net/ti,davinci-mdio.yaml @@ -72,9 +72,9 @@ unevaluatedProperties: false examples: - | davinci_mdio: mdio@4a101000 { - compatible = "ti,davinci_mdio"; - #address-cells = <1>; - #size-cells = <0>; - reg = <0x4a101000 0x1000>; - bus_freq = <1000000>; + compatible = "ti,davinci_mdio"; + #address-cells = <1>; + #size-cells = <0>; + reg = <0x4a101000 0x1000>; + bus_freq = <1000000>; }; diff --git a/Documentation/devicetree/bindings/net/ti,k3-am654-cpsw-nuss.yaml b/Documentation/devicetree/bindings/net/ti,k3-am654-cpsw-nuss.yaml index 02b6d32003cc..b11894fbaec4 100644 --- a/Documentation/devicetree/bindings/net/ti,k3-am654-cpsw-nuss.yaml +++ b/Documentation/devicetree/bindings/net/ti,k3-am654-cpsw-nuss.yaml @@ -302,16 +302,16 @@ examples: ti,fifo-depth = <DP83867_PHYCR_FIFO_DEPTH_4_B_NIB>; }; }; - }; - cpts@3d000 { - compatible = "ti,am65-cpts"; - reg = <0x0 0x3d000 0x0 0x400>; - clocks = <&k3_clks 18 2>; - clock-names = "cpts"; - interrupts-extended = <&gic500 GIC_SPI 858 IRQ_TYPE_LEVEL_HIGH>; - interrupt-names = "cpts"; - ti,cpts-ext-ts-inputs = <4>; - ti,cpts-periodic-outputs = <2>; + cpts@3d000 { + compatible = "ti,am65-cpts"; + reg = <0x0 0x3d000 0x0 0x400>; + clocks = <&k3_clks 18 2>; + clock-names = "cpts"; + interrupts-extended = <&gic500 GIC_SPI 858 IRQ_TYPE_LEVEL_HIGH>; + interrupt-names = "cpts"; + ti,cpts-ext-ts-inputs = <4>; + ti,cpts-periodic-outputs = <2>; + }; }; }; diff --git a/Documentation/devicetree/bindings/net/ti,k3-am654-cpts.yaml b/Documentation/devicetree/bindings/net/ti,k3-am654-cpts.yaml index 3888692275ad..3572749147fb 100644 --- 
a/Documentation/devicetree/bindings/net/ti,k3-am654-cpts.yaml +++ b/Documentation/devicetree/bindings/net/ti,k3-am654-cpts.yaml @@ -131,23 +131,23 @@ examples: #include <dt-bindings/interrupt-controller/arm-gic.h> cpts@310d0000 { - compatible = "ti,am65-cpts"; - reg = <0x310d0000 0x400>; - reg-names = "cpts"; - clocks = <&main_cpts_mux>; - clock-names = "cpts"; - interrupts-extended = <&k3_irq 163 0 IRQ_TYPE_LEVEL_HIGH>; - interrupt-names = "cpts"; - ti,cpts-periodic-outputs = <6>; - ti,cpts-ext-ts-inputs = <8>; - - main_cpts_mux: refclk-mux { - #clock-cells = <0>; - clocks = <&k3_clks 118 5>, <&k3_clks 118 11>, - <&k3_clks 157 91>, <&k3_clks 157 77>, - <&k3_clks 157 102>, <&k3_clks 157 80>, - <&k3_clks 120 3>, <&k3_clks 121 3>; - assigned-clocks = <&main_cpts_mux>; - assigned-clock-parents = <&k3_clks 118 11>; - }; + compatible = "ti,am65-cpts"; + reg = <0x310d0000 0x400>; + reg-names = "cpts"; + clocks = <&main_cpts_mux>; + clock-names = "cpts"; + interrupts-extended = <&k3_irq 163 0 IRQ_TYPE_LEVEL_HIGH>; + interrupt-names = "cpts"; + ti,cpts-periodic-outputs = <6>; + ti,cpts-ext-ts-inputs = <8>; + + main_cpts_mux: refclk-mux { + #clock-cells = <0>; + clocks = <&k3_clks 118 5>, <&k3_clks 118 11>, + <&k3_clks 157 91>, <&k3_clks 157 77>, + <&k3_clks 157 102>, <&k3_clks 157 80>, + <&k3_clks 120 3>, <&k3_clks 121 3>; + assigned-clocks = <&main_cpts_mux>; + assigned-clock-parents = <&k3_clks 118 11>; + }; }; diff --git a/Documentation/devicetree/bindings/net/wireless/marvell,sd8787.yaml b/Documentation/devicetree/bindings/net/wireless/marvell,sd8787.yaml index 1715b22e0dcf..930b700b73d0 100644 --- a/Documentation/devicetree/bindings/net/wireless/marvell,sd8787.yaml +++ b/Documentation/devicetree/bindings/net/wireless/marvell,sd8787.yaml @@ -79,15 +79,14 @@ examples: #include <dt-bindings/interrupt-controller/irq.h> mmc { - #address-cells = <1>; - #size-cells = <0>; - - wifi@1 { - compatible = "marvell,sd8897"; - reg = <1>; - interrupt-parent = <&pio>; - interrupts = 
<38 IRQ_TYPE_LEVEL_LOW>; - marvell,wakeup-pin = <3>; + #address-cells = <1>; + #size-cells = <0>; + + wifi@1 { + compatible = "marvell,sd8897"; + reg = <1>; + interrupt-parent = <&pio>; + interrupts = <38 IRQ_TYPE_LEVEL_LOW>; + marvell,wakeup-pin = <3>; }; }; - diff --git a/Documentation/netlink/specs/rt_link.yaml b/Documentation/netlink/specs/rt_link.yaml index 96465376d6fe..0d492500c7e5 100644 --- a/Documentation/netlink/specs/rt_link.yaml +++ b/Documentation/netlink/specs/rt_link.yaml @@ -1826,6 +1826,48 @@ attribute-sets: name: erspan-hwid type: u16 - + name: linkinfo-vti-attrs + name-prefix: ifla-vti- + attributes: + - + name: link + type: u32 + - + name: ikey + type: u32 + - + name: okey + type: u32 + - + name: local + type: binary + display-hint: ipv4 + - + name: remote + type: binary + display-hint: ipv4 + - + name: fwmark + type: u32 + - + name: linkinfo-vti6-attrs + subset-of: linkinfo-vti-attrs + attributes: + - + name: link + - + name: ikey + - + name: okey + - + name: local + display-hint: ipv6 + - + name: remote + display-hint: ipv6 + - + name: fwmark + - name: linkinfo-geneve-attrs name-prefix: ifla-geneve- attributes: @@ -1942,6 +1984,42 @@ attribute-sets: name: fwmark type: u32 - + name: linkinfo-ip6tnl-attrs + subset-of: linkinfo-iptun-attrs + attributes: + - + name: link + - + name: local + display-hint: ipv6 + - + name: remote + display-hint: ipv6 + - + name: ttl + - + name: encap-limit + - + name: flowinfo + - + name: flags + # ip6tnl unlike ipip and sit has 32b flags + type: u32 + - + name: proto + - + name: encap-type + - + name: encap-flags + - + name: encap-sport + - + name: encap-dport + - + name: collect-metadata + - + name: fwmark + - name: linkinfo-tun-attrs name-prefix: ifla-tun- attributes: @@ -2169,6 +2247,12 @@ attribute-sets: name: peer-scrub type: u32 enum: netkit-scrub + - + name: headroom + type: u16 + - + name: tailroom + type: u16 sub-messages: - @@ -2196,6 +2280,9 @@ sub-messages: value: ipip attribute-set: 
linkinfo-iptun-attrs - + value: ip6tnl + attribute-set: linkinfo-ip6tnl-attrs + - value: sit attribute-set: linkinfo-iptun-attrs - @@ -2208,6 +2295,12 @@ sub-messages: value: vrf attribute-set: linkinfo-vrf-attrs - + value: vti + attribute-set: linkinfo-vti-attrs + - + value: vti6 + attribute-set: linkinfo-vti6-attrs + - value: netkit attribute-set: linkinfo-netkit-attrs - diff --git a/Documentation/networking/device_drivers/ethernet/intel/i40e.rst b/Documentation/networking/device_drivers/ethernet/intel/i40e.rst index 4fbaa1a2d674..53d9d5829d69 100644 --- a/Documentation/networking/device_drivers/ethernet/intel/i40e.rst +++ b/Documentation/networking/device_drivers/ethernet/intel/i40e.rst @@ -299,6 +299,18 @@ Use ethtool to view and set link-down-on-close, as follows:: ethtool --show-priv-flags ethX ethtool --set-priv-flags ethX link-down-on-close [on|off] +Setting the mdd-auto-reset-vf Private Flag +------------------------------------------ + +When the mdd-auto-reset-vf private flag is set to "on", the problematic VF will +be automatically reset if a malformed descriptor is detected. If the flag is +set to "off", the problematic VF will be disabled. + +Use ethtool to view and set mdd-auto-reset-vf, as follows:: + + ethtool --show-priv-flags ethX + ethtool --set-priv-flags ethX mdd-auto-reset-vf [on|off] + Viewing Link Messages --------------------- Link messages will not be displayed to the console if the distribution is diff --git a/Documentation/networking/devlink/mlx5.rst b/Documentation/networking/devlink/mlx5.rst index 456985407475..41618538fc70 100644 --- a/Documentation/networking/devlink/mlx5.rst +++ b/Documentation/networking/devlink/mlx5.rst @@ -53,6 +53,9 @@ parameters. * ``smfs`` Software managed flow steering. In SMFS mode, the HW steering entities are created and manage through the driver without firmware intervention. + * ``hmfs`` Hardware managed flow steering. 
In HMFS mode, the driver + is configuring steering rules directly to the HW using Work Queues with + a special new type of WQE (Work Queue Element). SMFS mode is faster and provides better rule insertion rate compared to default DMFS mode. diff --git a/Documentation/networking/diagnostic/twisted_pair_layer1_diagnostics.rst b/Documentation/networking/diagnostic/twisted_pair_layer1_diagnostics.rst index c9be5cc7e113..079e17effadf 100644 --- a/Documentation/networking/diagnostic/twisted_pair_layer1_diagnostics.rst +++ b/Documentation/networking/diagnostic/twisted_pair_layer1_diagnostics.rst @@ -713,17 +713,23 @@ driver supports reporting such events. - **Monitor Error Counters**: - - While some NIC drivers and PHYs provide error counters, there is no unified - set of PHY-specific counters across all hardware. Additionally, not all - PHYs provide useful information related to errors like CRC errors, frame - drops, or link flaps. Therefore, this step is dependent on the specific - hardware and driver support. - - - **Next Steps**: Use `ethtool -S <interface>` to check if your driver - provides useful error counters. In some cases, counters may provide - information about errors like link flaps or physical layer problems (e.g., - excessive CRC errors), but results can vary significantly depending on the - PHY. + - Use `ethtool -S <interface> --all-groups` to retrieve standardized interface + statistics if the driver supports the unified interface: + + - **Command:** `ethtool -S <interface> --all-groups` + + - **Example Output (if supported)**: + + .. code-block:: bash + + phydev-RxFrames: 100391 + phydev-RxErrors: 0 + phydev-TxFrames: 9 + phydev-TxErrors: 0 + + - If the unified interface is not supported, use `ethtool -S <interface>` to + retrieve MAC and PHY counters. Note that non-standardized PHY counter names + vary by driver and must be interpreted accordingly: - **Command:** `ethtool -S <interface>` @@ -740,6 +746,17 @@ driver supports reporting such events. 
condition) or kernel log messages (e.g., link up/down events) to further diagnose the issue. + - **Compare Counters**: + + - Compare the egress and ingress frame counts reported by the PHY and MAC. + + - A small difference may occur due to sampling rate differences between the + MAC and PHY drivers, or if the PHY and MAC are not always fully + synchronized in their UP or DOWN states. + + - Significant discrepancies indicate potential issues in the data path + between the MAC and PHY. + When All Else Fails... ~~~~~~~~~~~~~~~~~~~~~~ diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index a7ba6368a4d5..da846f1d998e 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -1616,6 +1616,7 @@ the ``ETHTOOL_A_STATS_GROUPS`` bitset. Currently defined values are: ETHTOOL_STATS_ETH_PHY eth-phy Basic IEEE 802.3 PHY statistics (30.3.2.1.*) ETHTOOL_STATS_ETH_CTRL eth-ctrl Basic IEEE 802.3 MAC Ctrl statistics (30.3.3.*) ETHTOOL_STATS_RMON rmon RMON (RFC 2819) statistics + ETHTOOL_STATS_PHY phy Additional PHY statistics, not defined by IEEE ====================== ======== =============================================== Each group should have a corresponding ``ETHTOOL_A_STATS_GRP`` in the reply. 
diff --git a/Documentation/networking/multi-pf-netdev.rst b/Documentation/networking/multi-pf-netdev.rst index 2cd25d81aaa7..2f5a5bb3ca9a 100644 --- a/Documentation/networking/multi-pf-netdev.rst +++ b/Documentation/networking/multi-pf-netdev.rst @@ -89,7 +89,7 @@ Observability ============= The relation between PF, irq, napi, and queue can be observed via netlink spec:: - $ ./tools/net/ynl/cli.py --spec Documentation/netlink/specs/netdev.yaml --dump queue-get --json='{"ifindex": 13}' + $ ./tools/net/ynl/pyynl/cli.py --spec Documentation/netlink/specs/netdev.yaml --dump queue-get --json='{"ifindex": 13}' [{'id': 0, 'ifindex': 13, 'napi-id': 539, 'type': 'rx'}, {'id': 1, 'ifindex': 13, 'napi-id': 540, 'type': 'rx'}, {'id': 2, 'ifindex': 13, 'napi-id': 541, 'type': 'rx'}, @@ -101,7 +101,7 @@ The relation between PF, irq, napi, and queue can be observed via netlink spec:: {'id': 3, 'ifindex': 13, 'napi-id': 542, 'type': 'tx'}, {'id': 4, 'ifindex': 13, 'napi-id': 543, 'type': 'tx'}] - $ ./tools/net/ynl/cli.py --spec Documentation/netlink/specs/netdev.yaml --dump napi-get --json='{"ifindex": 13}' + $ ./tools/net/ynl/pyynl/cli.py --spec Documentation/netlink/specs/netdev.yaml --dump napi-get --json='{"ifindex": 13}' [{'id': 543, 'ifindex': 13, 'irq': 42}, {'id': 542, 'ifindex': 13, 'irq': 41}, {'id': 541, 'ifindex': 13, 'irq': 40}, diff --git a/Documentation/networking/napi.rst b/Documentation/networking/napi.rst index 02720dd71a76..6083210ab2a4 100644 --- a/Documentation/networking/napi.rst +++ b/Documentation/networking/napi.rst @@ -199,13 +199,13 @@ parameters mentioned above use hyphens instead of underscores: Per-NAPI configuration can be done programmatically in a user application or by using a script included in the kernel source tree: -``tools/net/ynl/cli.py``. +``tools/net/ynl/pyynl/cli.py``. For example, using the script: .. 
code-block:: bash - $ kernel-source/tools/net/ynl/cli.py \ + $ kernel-source/tools/net/ynl/pyynl/cli.py \ --spec Documentation/netlink/specs/netdev.yaml \ --do napi-set \ --json='{"id": 345, diff --git a/Documentation/networking/netdevices.rst b/Documentation/networking/netdevices.rst index 857c9784f87e..1d37038e9fbe 100644 --- a/Documentation/networking/netdevices.rst +++ b/Documentation/networking/netdevices.rst @@ -297,3 +297,13 @@ napi->poll: Context: softirq will be called with interrupts disabled by netconsole. + +NETDEV_INTERNAL symbol namespace +================================ + +Symbols exported as NETDEV_INTERNAL can only be used in networking +core and drivers which exclusively flow via the main networking list and trees. +Note that the inverse is not true, most symbols outside of NETDEV_INTERNAL +are not expected to be used by random code outside netdev either. +Symbols may lack the designation because they predate the namespaces, +or simply due to an oversight. diff --git a/Documentation/networking/netlink_spec/readme.txt b/Documentation/networking/netlink_spec/readme.txt index 6763f99d216c..030b44aca4e6 100644 --- a/Documentation/networking/netlink_spec/readme.txt +++ b/Documentation/networking/netlink_spec/readme.txt @@ -1,4 +1,4 @@ SPDX-License-Identifier: GPL-2.0 This file is populated during the build of the documentation (htmldocs) by the -tools/net/ynl/ynl-gen-rst.py script. +tools/net/ynl/pyynl/ynl_gen_rst.py script. diff --git a/Documentation/networking/xfrm_device.rst b/Documentation/networking/xfrm_device.rst index bfea9d8579ed..66f6e9a9b59a 100644 --- a/Documentation/networking/xfrm_device.rst +++ b/Documentation/networking/xfrm_device.rst @@ -169,7 +169,8 @@ the stack in xfrm_input(). hand the packet to napi_gro_receive() as usual -In ESN mode, xdo_dev_state_advance_esn() is called from xfrm_replay_advance_esn(). 
+In ESN mode, xdo_dev_state_advance_esn() is called from +xfrm_replay_advance_esn() for RX, and xfrm_replay_overflow_offload_esn for TX. Driver will check packet seq number and update HW ESN state machine if needed. Packet offload mode: diff --git a/Documentation/userspace-api/netlink/intro-specs.rst b/Documentation/userspace-api/netlink/intro-specs.rst index bada89699455..a4435ae4628d 100644 --- a/Documentation/userspace-api/netlink/intro-specs.rst +++ b/Documentation/userspace-api/netlink/intro-specs.rst @@ -15,7 +15,7 @@ developing Netlink related code. The tool is implemented in Python and can use a YAML specification to issue Netlink requests to the kernel. Only Generic Netlink is supported. -The tool is located at ``tools/net/ynl/cli.py``. It accepts +The tool is located at ``tools/net/ynl/pyynl/cli.py``. It accepts a handul of arguments, the most important ones are: - ``--spec`` - point to the spec file @@ -27,7 +27,7 @@ YAML specs can be found under ``Documentation/netlink/specs/``. Example use:: - $ ./tools/net/ynl/cli.py --spec Documentation/netlink/specs/ethtool.yaml \ + $ ./tools/net/ynl/pyynl/cli.py --spec Documentation/netlink/specs/ethtool.yaml \ --do rings-get \ --json '{"header":{"dev-index": 18}}' {'header': {'dev-index': 18, 'dev-name': 'eni1np1'}, @@ -75,7 +75,7 @@ the two marker lines like above to a file, add that file to git, and run the regeneration tool. Grep the tree for ``YNL-GEN`` to see other examples. -The code generation itself is performed by ``tools/net/ynl/ynl-gen-c.py`` +The code generation itself is performed by ``tools/net/ynl/pyynl/ynl_gen_c.py`` but it takes a few arguments so calling it directly for each file quickly becomes tedious. @@ -84,7 +84,7 @@ YNL lib ``tools/net/ynl/lib/`` contains an implementation of a C library (based on libmnl) which integrates with code generated by -``tools/net/ynl/ynl-gen-c.py`` to create easy to use netlink wrappers. 
+``tools/net/ynl/pyynl/ynl_gen_c.py`` to create easy to use netlink wrappers. YNL basics ---------- diff --git a/MAINTAINERS b/MAINTAINERS index a685c551faf0..797e94ba74f7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -949,7 +949,6 @@ AMAZON ETHERNET DRIVERS M: Shay Agroskin <shayagr@amazon.com> M: Arthur Kiyanovski <akiyano@amazon.com> R: David Arinzon <darinzon@amazon.com> -R: Noam Dagan <ndagan@amazon.com> R: Saeed Bishara <saeedb@amazon.com> L: netdev@vger.kernel.org S: Supported @@ -2690,7 +2689,6 @@ N: at91 N: atmel ARM/Microchip Sparx5 SoC support -M: Lars Povlsen <lars.povlsen@microchip.com> M: Steen Hegelund <Steen.Hegelund@microchip.com> M: Daniel Machon <daniel.machon@microchip.com> M: UNGLinuxDriver@microchip.com @@ -4065,7 +4063,6 @@ F: net/bluetooth/ BONDING DRIVER M: Jay Vosburgh <jv@jvosburgh.net> -M: Andy Gospodarek <andy@greyhouse.net> L: netdev@vger.kernel.org S: Maintained F: Documentation/networking/bonding.rst @@ -5118,6 +5115,7 @@ F: include/uapi/linux/can/gw.h F: include/uapi/linux/can/isotp.h F: include/uapi/linux/can/raw.h F: net/can/ +F: net/sched/em_canid.c CAN-J1939 NETWORK LAYER M: Robin van der Gracht <robin@protonic.nl> @@ -14574,7 +14572,6 @@ F: drivers/dma/mediatek/ MEDIATEK ETHERNET DRIVER M: Felix Fietkau <nbd@nbd.name> M: Sean Wang <sean.wang@mediatek.com> -M: Mark Lee <Mark-MC.Lee@mediatek.com> M: Lorenzo Bianconi <lorenzo@kernel.org> L: netdev@vger.kernel.org S: Maintained @@ -14764,7 +14761,7 @@ F: drivers/memory/mtk-smi.c F: include/soc/mediatek/smi.h MEDIATEK SWITCH DRIVER -M: Arınç ÜNAL <arinc.unal@arinc9.com> +M: Chester A. 
Unal <chester.a.unal@arinc9.com> M: Daniel Golle <daniel@makrotopia.org> M: DENG Qingfang <dqfext@gmail.com> M: Sean Wang <sean.wang@mediatek.com> @@ -16174,7 +16171,8 @@ M: Breno Leitao <leitao@debian.org> S: Maintained F: Documentation/networking/netconsole.rst F: drivers/net/netconsole.c -F: tools/testing/selftests/drivers/net/netcons_basic.sh +F: tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh +F: tools/testing/selftests/drivers/net/netcons\* NETDEVSIM M: Jakub Kicinski <kuba@kernel.org> @@ -18469,7 +18467,7 @@ F: Documentation/devicetree/bindings/pinctrl/mediatek,mt8183-pinctrl.yaml F: drivers/pinctrl/mediatek/ PIN CONTROLLER - MEDIATEK MIPS -M: Arınç ÜNAL <arinc.unal@arinc9.com> +M: Chester A. Unal <chester.a.unal@arinc9.com> M: Sergio Paracuellos <sergio.paracuellos@gmail.com> L: linux-mediatek@lists.infradead.org (moderated for non-subscribers) L: linux-mips@vger.kernel.org @@ -19513,7 +19511,7 @@ S: Maintained F: arch/mips/ralink RALINK MT7621 MIPS ARCHITECTURE -M: Arınç ÜNAL <arinc.unal@arinc9.com> +M: Chester A. 
Unal <chester.a.unal@arinc9.com> M: Sergio Paracuellos <sergio.paracuellos@gmail.com> L: linux-mips@vger.kernel.org S: Maintained @@ -20916,6 +20914,8 @@ F: kernel/sched/ SCHEDULER - SCHED_EXT R: Tejun Heo <tj@kernel.org> R: David Vernet <void@manifault.com> +R: Andrea Righi <arighi@nvidia.com> +R: Changwoo Min <changwoo@igalia.com> L: linux-kernel@vger.kernel.org S: Maintained W: https://github.com/sched-ext/scx @@ -22510,11 +22510,8 @@ F: Documentation/devicetree/bindings/phy/st,stm32mp25-combophy.yaml F: drivers/phy/st/phy-stm32-combophy.c STMMAC ETHERNET DRIVER -M: Alexandre Torgue <alexandre.torgue@foss.st.com> -M: Jose Abreu <joabreu@synopsys.com> L: netdev@vger.kernel.org -S: Supported -W: http://www.stlinux.com +S: Orphan F: Documentation/networking/device_drivers/ethernet/stmicro/ F: drivers/net/ethernet/stmicro/stmmac/ @@ -22746,9 +22743,8 @@ S: Supported F: drivers/net/ethernet/synopsys/ SYNOPSYS DESIGNWARE ETHERNET XPCS DRIVER -M: Jose Abreu <Jose.Abreu@synopsys.com> L: netdev@vger.kernel.org -S: Supported +S: Orphan F: drivers/net/pcs/pcs-xpcs.c F: drivers/net/pcs/pcs-xpcs.h F: include/linux/pcs/pcs-xpcs.h @@ -23656,7 +23652,6 @@ F: tools/testing/selftests/timers/ TIPC NETWORK LAYER M: Jon Maloy <jmaloy@redhat.com> -M: Ying Xue <ying.xue@windriver.com> L: netdev@vger.kernel.org (core kernel code) L: tipc-discussion@lists.sourceforge.net (user apps, general discussion) S: Maintained @@ -24262,7 +24257,8 @@ F: Documentation/devicetree/bindings/usb/nxp,isp1760.yaml F: drivers/usb/isp1760/* USB LAN78XX ETHERNET DRIVER -M: Woojung Huh <woojung.huh@microchip.com> +M: Thangaraj Samynathan <Thangaraj.S@microchip.com> +M: Rengarajan Sundararajan <Rengarajan.S@microchip.com> M: UNGLinuxDriver@microchip.com L: netdev@vger.kernel.org S: Maintained @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 13 SUBLEVEL = 0 -EXTRAVERSION = -rc5 +EXTRAVERSION = -rc6 NAME = Baby Opossum Posse # *DOCUMENTATION* diff --git a/drivers/bluetooth/btmtk.c b/drivers/bluetooth/btmtk.c index 
7fd9d5ddce02..224eafc27dbe 100644 --- a/drivers/bluetooth/btmtk.c +++ b/drivers/bluetooth/btmtk.c @@ -1472,10 +1472,15 @@ EXPORT_SYMBOL_GPL(btmtk_usb_setup); int btmtk_usb_shutdown(struct hci_dev *hdev) { + struct btmtk_data *data = hci_get_priv(hdev); struct btmtk_hci_wmt_params wmt_params; u8 param = 0; int err; + err = usb_autopm_get_interface(data->intf); + if (err < 0) + return err; + /* Disable the device */ wmt_params.op = BTMTK_WMT_FUNC_CTRL; wmt_params.flag = 0; @@ -1486,9 +1491,11 @@ int btmtk_usb_shutdown(struct hci_dev *hdev) err = btmtk_usb_hci_wmt_sync(hdev, &wmt_params); if (err < 0) { bt_dev_err(hdev, "Failed to send wmt func ctrl (%d)", err); + usb_autopm_put_interface(data->intf); return err; } + usb_autopm_put_interface(data->intf); return 0; } EXPORT_SYMBOL_GPL(btmtk_usb_shutdown); diff --git a/drivers/bluetooth/btnxpuart.c b/drivers/bluetooth/btnxpuart.c index 569f5b7d6e46..1230045d78a5 100644 --- a/drivers/bluetooth/btnxpuart.c +++ b/drivers/bluetooth/btnxpuart.c @@ -1381,6 +1381,7 @@ static void btnxpuart_tx_work(struct work_struct *work) while ((skb = nxp_dequeue(nxpdev))) { len = serdev_device_write_buf(serdev, skb->data, skb->len); + serdev_device_wait_until_sent(serdev, 0); hdev->stat.byte_tx += len; skb_pull(skb, len); diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c index 6a99a459b80b..51745ed1bbab 100644 --- a/drivers/cdrom/cdrom.c +++ b/drivers/cdrom/cdrom.c @@ -1106,7 +1106,7 @@ int open_for_data(struct cdrom_device_info *cdi) } } - cd_dbg(CD_OPEN, "all seems well, opening the devicen"); + cd_dbg(CD_OPEN, "all seems well, opening the device\n"); /* all seems well, we can open the device */ ret = cdo->open(cdi, 0); /* open for data */ diff --git a/drivers/clk/imx/clk-imx8mp-audiomix.c b/drivers/clk/imx/clk-imx8mp-audiomix.c index b2cb157703c5..c409fc7e0618 100644 --- a/drivers/clk/imx/clk-imx8mp-audiomix.c +++ b/drivers/clk/imx/clk-imx8mp-audiomix.c @@ -278,7 +278,8 @@ static int 
clk_imx8mp_audiomix_reset_controller_register(struct device *dev, #else /* !CONFIG_RESET_CONTROLLER */ -static int clk_imx8mp_audiomix_reset_controller_register(struct clk_imx8mp_audiomix_priv *priv) +static int clk_imx8mp_audiomix_reset_controller_register(struct device *dev, + struct clk_imx8mp_audiomix_priv *priv) { return 0; } diff --git a/drivers/clk/thead/clk-th1520-ap.c b/drivers/clk/thead/clk-th1520-ap.c index 17e32ae08720..1015fab95251 100644 --- a/drivers/clk/thead/clk-th1520-ap.c +++ b/drivers/clk/thead/clk-th1520-ap.c @@ -779,6 +779,13 @@ static struct ccu_div dpu1_clk = { }, }; +static CLK_FIXED_FACTOR_HW(emmc_sdio_ref_clk, "emmc-sdio-ref", + &video_pll_clk.common.hw, 4, 1, 0); + +static const struct clk_parent_data emmc_sdio_ref_clk_pd[] = { + { .hw = &emmc_sdio_ref_clk.hw }, +}; + static CCU_GATE(CLK_BROM, brom_clk, "brom", ahb2_cpusys_hclk_pd, 0x100, BIT(4), 0); static CCU_GATE(CLK_BMU, bmu_clk, "bmu", axi4_cpusys2_aclk_pd, 0x100, BIT(5), 0); static CCU_GATE(CLK_AON2CPU_A2X, aon2cpu_a2x_clk, "aon2cpu-a2x", axi4_cpusys2_aclk_pd, @@ -798,7 +805,7 @@ static CCU_GATE(CLK_PERISYS_APB4_HCLK, perisys_apb4_hclk, "perisys-apb4-hclk", p 0x150, BIT(12), 0); static CCU_GATE(CLK_NPU_AXI, npu_axi_clk, "npu-axi", axi_aclk_pd, 0x1c8, BIT(5), 0); static CCU_GATE(CLK_CPU2VP, cpu2vp_clk, "cpu2vp", axi_aclk_pd, 0x1e0, BIT(13), 0); -static CCU_GATE(CLK_EMMC_SDIO, emmc_sdio_clk, "emmc-sdio", video_pll_clk_pd, 0x204, BIT(30), 0); +static CCU_GATE(CLK_EMMC_SDIO, emmc_sdio_clk, "emmc-sdio", emmc_sdio_ref_clk_pd, 0x204, BIT(30), 0); static CCU_GATE(CLK_GMAC1, gmac1_clk, "gmac1", gmac_pll_clk_pd, 0x204, BIT(26), 0); static CCU_GATE(CLK_PADCTRL1, padctrl1_clk, "padctrl1", perisys_apb_pclk_pd, 0x204, BIT(24), 0); static CCU_GATE(CLK_DSMART, dsmart_clk, "dsmart", perisys_apb_pclk_pd, 0x204, BIT(23), 0); @@ -1059,6 +1066,10 @@ static int th1520_clk_probe(struct platform_device *pdev) return ret; priv->hws[CLK_PLL_GMAC_100M] = &gmac_pll_clk_100m.hw; + ret = 
devm_clk_hw_register(dev, &emmc_sdio_ref_clk.hw); + if (ret) + return ret; + ret = devm_of_clk_add_hw_provider(dev, of_clk_hw_onecell_get, priv); if (ret) return ret; diff --git a/drivers/md/dm-ebs-target.c b/drivers/md/dm-ebs-target.c index ec5db1478b2f..18ae45dcbfb2 100644 --- a/drivers/md/dm-ebs-target.c +++ b/drivers/md/dm-ebs-target.c @@ -442,7 +442,7 @@ static int ebs_iterate_devices(struct dm_target *ti, static struct target_type ebs_target = { .name = "ebs", .version = {1, 0, 1}, - .features = DM_TARGET_PASSES_INTEGRITY, + .features = 0, .module = THIS_MODULE, .ctr = ebs_ctr, .dtr = ebs_dtr, diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index bf0f9dddd146..05cf4e3f2bbe 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -2332,10 +2332,9 @@ static struct thin_c *get_first_thin(struct pool *pool) struct thin_c *tc = NULL; rcu_read_lock(); - if (!list_empty(&pool->active_thins)) { - tc = list_entry_rcu(pool->active_thins.next, struct thin_c, list); + tc = list_first_or_null_rcu(&pool->active_thins, struct thin_c, list); + if (tc) thin_get(tc); - } rcu_read_unlock(); return tc; diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index 62b1a44b8dd2..e61855da6461 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -40,35 +40,23 @@ static inline u64 fec_interleave(struct dm_verity *v, u64 offset) } /* - * Decode an RS block using Reed-Solomon. - */ -static int fec_decode_rs8(struct dm_verity *v, struct dm_verity_fec_io *fio, - u8 *data, u8 *fec, int neras) -{ - int i; - uint16_t par[DM_VERITY_FEC_RSM - DM_VERITY_FEC_MIN_RSN]; - - for (i = 0; i < v->fec->roots; i++) - par[i] = fec[i]; - - return decode_rs8(fio->rs, data, par, v->fec->rsn, NULL, neras, - fio->erasures, 0, NULL); -} - -/* * Read error-correcting codes for the requested RS block. Returns a pointer * to the data block. Caller is responsible for releasing buf. 
*/ static u8 *fec_read_parity(struct dm_verity *v, u64 rsb, int index, - unsigned int *offset, struct dm_buffer **buf, - unsigned short ioprio) + unsigned int *offset, unsigned int par_buf_offset, + struct dm_buffer **buf, unsigned short ioprio) { u64 position, block, rem; u8 *res; + /* We have already part of parity bytes read, skip to the next block */ + if (par_buf_offset) + index++; + position = (index + rsb) * v->fec->roots; block = div64_u64_rem(position, v->fec->io_size, &rem); - *offset = (unsigned int)rem; + *offset = par_buf_offset ? 0 : (unsigned int)rem; res = dm_bufio_read_with_ioprio(v->fec->bufio, block, buf, ioprio); if (IS_ERR(res)) { @@ -128,11 +116,13 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_io *io, { int r, corrected = 0, res; struct dm_buffer *buf; - unsigned int n, i, offset; + unsigned int n, i, j, offset, par_buf_offset = 0; + uint16_t par_buf[DM_VERITY_FEC_RSM - DM_VERITY_FEC_MIN_RSN]; u8 *par, *block; struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size); - par = fec_read_parity(v, rsb, block_offset, &offset, &buf, bio_prio(bio)); + par = fec_read_parity(v, rsb, block_offset, &offset, + par_buf_offset, &buf, bio_prio(bio)); if (IS_ERR(par)) return PTR_ERR(par); @@ -142,7 +132,11 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_io *io, */ fec_for_each_buffer_rs_block(fio, n, i) { block = fec_buffer_rs_block(v, fio, n, i); - res = fec_decode_rs8(v, fio, block, &par[offset], neras); + for (j = 0; j < v->fec->roots - par_buf_offset; j++) + par_buf[par_buf_offset + j] = par[offset + j]; + /* Decode an RS block using Reed-Solomon */ + res = decode_rs8(fio->rs, block, par_buf, v->fec->rsn, + NULL, neras, fio->erasures, 0, NULL); if (res < 0) { r = res; goto error; @@ -155,12 +149,22 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_io *io, if (block_offset >= 1 << v->data_dev_block_bits) goto done; - /* read the next block when we run out of parity bytes */ - offset 
+= v->fec->roots; + /* Read the next block when we run out of parity bytes */ + offset += (v->fec->roots - par_buf_offset); + /* Check if parity bytes are split between blocks */ + if (offset < v->fec->io_size && (offset + v->fec->roots) > v->fec->io_size) { + par_buf_offset = v->fec->io_size - offset; + for (j = 0; j < par_buf_offset; j++) + par_buf[j] = par[offset + j]; + offset += par_buf_offset; + } else + par_buf_offset = 0; + if (offset >= v->fec->io_size) { dm_bufio_release(buf); - par = fec_read_parity(v, rsb, block_offset, &offset, &buf, bio_prio(bio)); + par = fec_read_parity(v, rsb, block_offset, &offset, + par_buf_offset, &buf, bio_prio(bio)); if (IS_ERR(par)) return PTR_ERR(par); } @@ -724,10 +728,7 @@ int verity_fec_ctr(struct dm_verity *v) return -E2BIG; } - if ((f->roots << SECTOR_SHIFT) & ((1 << v->data_dev_block_bits) - 1)) - f->io_size = 1 << v->data_dev_block_bits; - else - f->io_size = v->fec->roots << SECTOR_SHIFT; + f->io_size = 1 << v->data_dev_block_bits; f->bufio = dm_bufio_client_create(f->dev->bdev, f->io_size, diff --git a/drivers/md/persistent-data/dm-array.c b/drivers/md/persistent-data/dm-array.c index 157c9bd2fed7..8f8792e55806 100644 --- a/drivers/md/persistent-data/dm-array.c +++ b/drivers/md/persistent-data/dm-array.c @@ -917,23 +917,27 @@ static int load_ablock(struct dm_array_cursor *c) if (c->block) unlock_ablock(c->info, c->block); - c->block = NULL; - c->ab = NULL; c->index = 0; r = dm_btree_cursor_get_value(&c->cursor, &key, &value_le); if (r) { DMERR("dm_btree_cursor_get_value failed"); - dm_btree_cursor_end(&c->cursor); + goto out; } else { r = get_ablock(c->info, le64_to_cpu(value_le), &c->block, &c->ab); if (r) { DMERR("get_ablock failed"); - dm_btree_cursor_end(&c->cursor); + goto out; } } + return 0; + +out: + dm_btree_cursor_end(&c->cursor); + c->block = NULL; + c->ab = NULL; return r; } @@ -956,10 +960,10 @@ EXPORT_SYMBOL_GPL(dm_array_cursor_begin); void dm_array_cursor_end(struct dm_array_cursor *c) { - if 
(c->block) { + if (c->block) unlock_ablock(c->info, c->block); - dm_btree_cursor_end(&c->cursor); - } + + dm_btree_cursor_end(&c->cursor); } EXPORT_SYMBOL_GPL(dm_array_cursor_end); @@ -999,6 +1003,7 @@ int dm_array_cursor_skip(struct dm_array_cursor *c, uint32_t count) } count -= remaining; + c->index += (remaining - 1); r = dm_array_cursor_next(c); } while (!r); diff --git a/drivers/net/can/dev/dev.c b/drivers/net/can/dev/dev.c index 681643ab3780..5ec3170b896a 100644 --- a/drivers/net/can/dev/dev.c +++ b/drivers/net/can/dev/dev.c @@ -85,8 +85,6 @@ const char *can_get_state_str(const enum can_state state) default: return "<unknown>"; } - - return "<unknown>"; } EXPORT_SYMBOL_GPL(can_get_state_str); diff --git a/drivers/net/can/grcan.c b/drivers/net/can/grcan.c index cdf0ec9fa7f3..21a61b86f67d 100644 --- a/drivers/net/can/grcan.c +++ b/drivers/net/can/grcan.c @@ -1073,9 +1073,10 @@ static int grcan_open(struct net_device *dev) if (err) goto exit_close_candev; + napi_enable(&priv->napi); + spin_lock_irqsave(&priv->lock, flags); - napi_enable(&priv->napi); grcan_start(dev); if (!(priv->can.ctrlmode & CAN_CTRLMODE_LISTENONLY)) netif_start_queue(dev); diff --git a/drivers/net/can/kvaser_pciefd.c b/drivers/net/can/kvaser_pciefd.c index fee012b57f33..fa04a7ced02b 100644 --- a/drivers/net/can/kvaser_pciefd.c +++ b/drivers/net/can/kvaser_pciefd.c @@ -999,7 +999,8 @@ static int kvaser_pciefd_setup_can_ctrls(struct kvaser_pciefd *pcie) can->can.ctrlmode_supported = CAN_CTRLMODE_LISTENONLY | CAN_CTRLMODE_FD | CAN_CTRLMODE_FD_NON_ISO | - CAN_CTRLMODE_CC_LEN8_DLC; + CAN_CTRLMODE_CC_LEN8_DLC | + CAN_CTRLMODE_BERR_REPORTING; status = ioread32(can->reg_base + KVASER_PCIEFD_KCAN_STAT_REG); if (!(status & KVASER_PCIEFD_KCAN_STAT_FD)) { @@ -1234,11 +1235,15 @@ static int kvaser_pciefd_handle_data_packet(struct kvaser_pciefd *pcie, } static void kvaser_pciefd_change_state(struct kvaser_pciefd_can *can, + const struct can_berr_counter *bec, struct can_frame *cf, enum can_state 
new_state, enum can_state tx_state, enum can_state rx_state) { + enum can_state old_state; + + old_state = can->can.state; can_change_state(can->can.dev, cf, tx_state, rx_state); if (new_state == CAN_STATE_BUS_OFF) { @@ -1254,6 +1259,18 @@ static void kvaser_pciefd_change_state(struct kvaser_pciefd_can *can, can_bus_off(ndev); } } + if (old_state == CAN_STATE_BUS_OFF && + new_state == CAN_STATE_ERROR_ACTIVE && + can->can.restart_ms) { + can->can.can_stats.restarts++; + if (cf) + cf->can_id |= CAN_ERR_RESTARTED; + } + if (cf && new_state != CAN_STATE_BUS_OFF) { + cf->can_id |= CAN_ERR_CNT; + cf->data[6] = bec->txerr; + cf->data[7] = bec->rxerr; + } } static void kvaser_pciefd_packet_to_state(struct kvaser_pciefd_rx_packet *p, @@ -1288,7 +1305,7 @@ static int kvaser_pciefd_rx_error_frame(struct kvaser_pciefd_can *can, struct can_berr_counter bec; enum can_state old_state, new_state, tx_state, rx_state; struct net_device *ndev = can->can.dev; - struct sk_buff *skb; + struct sk_buff *skb = NULL; struct can_frame *cf = NULL; old_state = can->can.state; @@ -1297,16 +1314,10 @@ static int kvaser_pciefd_rx_error_frame(struct kvaser_pciefd_can *can, bec.rxerr = FIELD_GET(KVASER_PCIEFD_SPACK_RXERR_MASK, p->header[0]); kvaser_pciefd_packet_to_state(p, &bec, &new_state, &tx_state, &rx_state); - skb = alloc_can_err_skb(ndev, &cf); + if (can->can.ctrlmode & CAN_CTRLMODE_BERR_REPORTING) + skb = alloc_can_err_skb(ndev, &cf); if (new_state != old_state) { - kvaser_pciefd_change_state(can, cf, new_state, tx_state, rx_state); - if (old_state == CAN_STATE_BUS_OFF && - new_state == CAN_STATE_ERROR_ACTIVE && - can->can.restart_ms) { - can->can.can_stats.restarts++; - if (skb) - cf->can_id |= CAN_ERR_RESTARTED; - } + kvaser_pciefd_change_state(can, &bec, cf, new_state, tx_state, rx_state); } can->err_rep_cnt++; @@ -1319,18 +1330,19 @@ static int kvaser_pciefd_rx_error_frame(struct kvaser_pciefd_can *can, can->bec.txerr = bec.txerr; can->bec.rxerr = bec.rxerr; - if (!skb) { - 
ndev->stats.rx_dropped++; - return -ENOMEM; + if (can->can.ctrlmode & CAN_CTRLMODE_BERR_REPORTING) { + if (!skb) { + netdev_warn(ndev, "No memory left for err_skb\n"); + ndev->stats.rx_dropped++; + return -ENOMEM; + } + kvaser_pciefd_set_skb_timestamp(can->kv_pcie, skb, p->timestamp); + cf->can_id |= CAN_ERR_BUSERROR | CAN_ERR_CNT; + cf->data[6] = bec.txerr; + cf->data[7] = bec.rxerr; + netif_rx(skb); } - kvaser_pciefd_set_skb_timestamp(can->kv_pcie, skb, p->timestamp); - cf->can_id |= CAN_ERR_BUSERROR | CAN_ERR_CNT; - cf->data[6] = bec.txerr; - cf->data[7] = bec.rxerr; - - netif_rx(skb); - return 0; } @@ -1359,6 +1371,7 @@ static int kvaser_pciefd_handle_status_resp(struct kvaser_pciefd_can *can, { struct can_berr_counter bec; enum can_state old_state, new_state, tx_state, rx_state; + int ret = 0; old_state = can->can.state; @@ -1372,25 +1385,15 @@ static int kvaser_pciefd_handle_status_resp(struct kvaser_pciefd_can *can, struct can_frame *cf; skb = alloc_can_err_skb(ndev, &cf); - if (!skb) { + kvaser_pciefd_change_state(can, &bec, cf, new_state, tx_state, rx_state); + if (skb) { + kvaser_pciefd_set_skb_timestamp(can->kv_pcie, skb, p->timestamp); + netif_rx(skb); + } else { ndev->stats.rx_dropped++; - return -ENOMEM; + netdev_warn(ndev, "No memory left for err_skb\n"); + ret = -ENOMEM; } - - kvaser_pciefd_change_state(can, cf, new_state, tx_state, rx_state); - if (old_state == CAN_STATE_BUS_OFF && - new_state == CAN_STATE_ERROR_ACTIVE && - can->can.restart_ms) { - can->can.can_stats.restarts++; - cf->can_id |= CAN_ERR_RESTARTED; - } - - kvaser_pciefd_set_skb_timestamp(can->kv_pcie, skb, p->timestamp); - - cf->data[6] = bec.txerr; - cf->data[7] = bec.rxerr; - - netif_rx(skb); } can->bec.txerr = bec.txerr; can->bec.rxerr = bec.rxerr; @@ -1398,7 +1401,7 @@ static int kvaser_pciefd_handle_status_resp(struct kvaser_pciefd_can *can, if (bec.txerr || bec.rxerr) mod_timer(&can->bec_poll_timer, KVASER_PCIEFD_BEC_POLL_FREQ); - return 0; + return ret; } static int 
kvaser_pciefd_handle_status_packet(struct kvaser_pciefd *pcie, diff --git a/drivers/net/can/m_can/m_can.c b/drivers/net/can/m_can/m_can.c index 97cd8bbf2e32..d025d4163fd1 100644 --- a/drivers/net/can/m_can/m_can.c +++ b/drivers/net/can/m_can/m_can.c @@ -1785,6 +1785,13 @@ static void m_can_stop(struct net_device *dev) /* set the state as STOPPED */ cdev->can.state = CAN_STATE_STOPPED; + + if (cdev->ops->deinit) { + ret = cdev->ops->deinit(cdev); + if (ret) + netdev_err(dev, "failed to deinitialize: %pe\n", + ERR_PTR(ret)); + } } static int m_can_close(struct net_device *dev) @@ -2466,6 +2473,7 @@ int m_can_class_suspend(struct device *dev) { struct m_can_classdev *cdev = dev_get_drvdata(dev); struct net_device *ndev = cdev->net; + int ret = 0; if (netif_running(ndev)) { netif_stop_queue(ndev); @@ -2478,6 +2486,9 @@ int m_can_class_suspend(struct device *dev) if (cdev->pm_wake_source) { hrtimer_cancel(&cdev->hrtimer); m_can_write(cdev, M_CAN_IE, IR_RF0N); + + if (cdev->ops->deinit) + ret = cdev->ops->deinit(cdev); } else { m_can_stop(ndev); } @@ -2489,7 +2500,7 @@ int m_can_class_suspend(struct device *dev) cdev->can.state = CAN_STATE_SLEEPING; - return 0; + return ret; } EXPORT_SYMBOL_GPL(m_can_class_suspend); @@ -2497,14 +2508,13 @@ int m_can_class_resume(struct device *dev) { struct m_can_classdev *cdev = dev_get_drvdata(dev); struct net_device *ndev = cdev->net; + int ret = 0; pinctrl_pm_select_default_state(dev); cdev->can.state = CAN_STATE_ERROR_ACTIVE; if (netif_running(ndev)) { - int ret; - ret = m_can_clk_start(cdev); if (ret) return ret; @@ -2517,6 +2527,10 @@ int m_can_class_resume(struct device *dev) * again. 
*/ cdev->active_interrupts |= IR_RF0N | IR_TEFN; + + if (cdev->ops->init) + ret = cdev->ops->init(cdev); + m_can_write(cdev, M_CAN_IE, cdev->active_interrupts); } else { ret = m_can_start(ndev); @@ -2530,7 +2544,7 @@ int m_can_class_resume(struct device *dev) netif_start_queue(ndev); } - return 0; + return ret; } EXPORT_SYMBOL_GPL(m_can_class_resume); diff --git a/drivers/net/can/m_can/m_can.h b/drivers/net/can/m_can/m_can.h index ef39e8e527ab..bd4746c63af3 100644 --- a/drivers/net/can/m_can/m_can.h +++ b/drivers/net/can/m_can/m_can.h @@ -68,6 +68,7 @@ struct m_can_ops { int (*write_fifo)(struct m_can_classdev *cdev, int addr_offset, const void *val, size_t val_count); int (*init)(struct m_can_classdev *cdev); + int (*deinit)(struct m_can_classdev *cdev); }; struct m_can_tx_op { diff --git a/drivers/net/can/m_can/tcan4x5x-core.c b/drivers/net/can/m_can/tcan4x5x-core.c index 2f73bf3abad8..e5c162f8c589 100644 --- a/drivers/net/can/m_can/tcan4x5x-core.c +++ b/drivers/net/can/m_can/tcan4x5x-core.c @@ -92,6 +92,8 @@ #define TCAN4X5X_MODE_STANDBY BIT(6) #define TCAN4X5X_MODE_NORMAL BIT(7) +#define TCAN4X5X_NWKRQ_VOLTAGE_VIO BIT(19) + #define TCAN4X5X_DISABLE_WAKE_MSK (BIT(31) | BIT(30)) #define TCAN4X5X_DISABLE_INH_MSK BIT(9) @@ -267,9 +269,24 @@ static int tcan4x5x_init(struct m_can_classdev *cdev) if (ret) return ret; + if (tcan4x5x->nwkrq_voltage_vio) { + ret = regmap_set_bits(tcan4x5x->regmap, TCAN4X5X_CONFIG, + TCAN4X5X_NWKRQ_VOLTAGE_VIO); + if (ret) + return ret; + } + return ret; } +static int tcan4x5x_deinit(struct m_can_classdev *cdev) +{ + struct tcan4x5x_priv *tcan4x5x = cdev_to_priv(cdev); + + return regmap_update_bits(tcan4x5x->regmap, TCAN4X5X_CONFIG, + TCAN4X5X_MODE_SEL_MASK, TCAN4X5X_MODE_STANDBY); +}; + static int tcan4x5x_disable_wake(struct m_can_classdev *cdev) { struct tcan4x5x_priv *tcan4x5x = cdev_to_priv(cdev); @@ -318,6 +335,14 @@ static const struct tcan4x5x_version_info return &tcan4x5x_versions[TCAN4X5X]; } +static void 
tcan4x5x_get_dt_data(struct m_can_classdev *cdev) +{ + struct tcan4x5x_priv *tcan4x5x = cdev_to_priv(cdev); + + tcan4x5x->nwkrq_voltage_vio = + of_property_read_bool(cdev->dev->of_node, "ti,nwkrq-voltage-vio"); +} + static int tcan4x5x_get_gpios(struct m_can_classdev *cdev, const struct tcan4x5x_version_info *version_info) { @@ -359,6 +384,7 @@ static int tcan4x5x_get_gpios(struct m_can_classdev *cdev, static const struct m_can_ops tcan4x5x_ops = { .init = tcan4x5x_init, + .deinit = tcan4x5x_deinit, .read_reg = tcan4x5x_read_reg, .write_reg = tcan4x5x_write_reg, .write_fifo = tcan4x5x_write_fifo, @@ -392,7 +418,7 @@ static int tcan4x5x_can_probe(struct spi_device *spi) priv->power = NULL; } - m_can_class_get_clocks(mcan_class); + mcan_class->cclk = devm_clk_get(mcan_class->dev, "cclk"); if (IS_ERR(mcan_class->cclk)) { dev_err(&spi->dev, "no CAN clock source defined\n"); freq = TCAN4X5X_EXT_CLK_DEF; @@ -453,6 +479,8 @@ static int tcan4x5x_can_probe(struct spi_device *spi) goto out_power; } + tcan4x5x_get_dt_data(mcan_class); + tcan4x5x_check_wake(priv); ret = tcan4x5x_write_tcan_reg(mcan_class, TCAN4X5X_INT_EN, 0); diff --git a/drivers/net/can/m_can/tcan4x5x.h b/drivers/net/can/m_can/tcan4x5x.h index e62c030d3e1e..203399d5e8cc 100644 --- a/drivers/net/can/m_can/tcan4x5x.h +++ b/drivers/net/can/m_can/tcan4x5x.h @@ -42,6 +42,8 @@ struct tcan4x5x_priv { struct tcan4x5x_map_buf map_buf_rx; struct tcan4x5x_map_buf map_buf_tx; + + bool nwkrq_voltage_vio; }; static inline void diff --git a/drivers/net/can/sun4i_can.c b/drivers/net/can/sun4i_can.c index 4311c1f0eafd..6fcb301ef611 100644 --- a/drivers/net/can/sun4i_can.c +++ b/drivers/net/can/sun4i_can.c @@ -570,7 +570,7 @@ static int sun4i_can_err(struct net_device *dev, u8 isrc, u8 status) else state = CAN_STATE_ERROR_ACTIVE; } - if (skb && state != CAN_STATE_BUS_OFF) { + if (likely(skb) && state != CAN_STATE_BUS_OFF) { cf->can_id |= CAN_ERR_CNT; cf->data[6] = txerr; cf->data[7] = rxerr; diff --git 
a/drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c b/drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c index 7d12776ab63e..dcb0bcbe0565 100644 --- a/drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c +++ b/drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c @@ -818,7 +818,8 @@ static int kvaser_usb_init_one(struct kvaser_usb *dev, int channel) init_completion(&priv->stop_comp); init_completion(&priv->flush_comp); init_completion(&priv->get_busparams_comp); - priv->can.ctrlmode_supported = CAN_CTRLMODE_CC_LEN8_DLC; + priv->can.ctrlmode_supported = CAN_CTRLMODE_CC_LEN8_DLC | + CAN_CTRLMODE_BERR_REPORTING; priv->dev = dev; priv->netdev = netdev; diff --git a/drivers/net/can/usb/kvaser_usb/kvaser_usb_hydra.c b/drivers/net/can/usb/kvaser_usb/kvaser_usb_hydra.c index 3764b263add3..8e88b5917796 100644 --- a/drivers/net/can/usb/kvaser_usb/kvaser_usb_hydra.c +++ b/drivers/net/can/usb/kvaser_usb/kvaser_usb_hydra.c @@ -926,6 +926,42 @@ kvaser_usb_hydra_bus_status_to_can_state(const struct kvaser_usb_net_priv *priv, } } +static void kvaser_usb_hydra_change_state(struct kvaser_usb_net_priv *priv, + const struct can_berr_counter *bec, + struct can_frame *cf, + enum can_state new_state) +{ + struct net_device *netdev = priv->netdev; + enum can_state old_state = priv->can.state; + enum can_state tx_state, rx_state; + + tx_state = (bec->txerr >= bec->rxerr) ? + new_state : CAN_STATE_ERROR_ACTIVE; + rx_state = (bec->txerr <= bec->rxerr) ? 
+ new_state : CAN_STATE_ERROR_ACTIVE; + can_change_state(netdev, cf, tx_state, rx_state); + + if (new_state == CAN_STATE_BUS_OFF && old_state < CAN_STATE_BUS_OFF) { + if (priv->can.restart_ms == 0) + kvaser_usb_hydra_send_simple_cmd_async(priv, CMD_STOP_CHIP_REQ); + + can_bus_off(netdev); + } + + if (priv->can.restart_ms && + old_state >= CAN_STATE_BUS_OFF && + new_state < CAN_STATE_BUS_OFF) { + priv->can.can_stats.restarts++; + if (cf) + cf->can_id |= CAN_ERR_RESTARTED; + } + if (cf && new_state != CAN_STATE_BUS_OFF) { + cf->can_id |= CAN_ERR_CNT; + cf->data[6] = bec->txerr; + cf->data[7] = bec->rxerr; + } +} + static void kvaser_usb_hydra_update_state(struct kvaser_usb_net_priv *priv, u8 bus_status, const struct can_berr_counter *bec) @@ -951,41 +987,11 @@ static void kvaser_usb_hydra_update_state(struct kvaser_usb_net_priv *priv, return; skb = alloc_can_err_skb(netdev, &cf); - if (skb) { - enum can_state tx_state, rx_state; - - tx_state = (bec->txerr >= bec->rxerr) ? - new_state : CAN_STATE_ERROR_ACTIVE; - rx_state = (bec->txerr <= bec->rxerr) ? 
- new_state : CAN_STATE_ERROR_ACTIVE; - can_change_state(netdev, cf, tx_state, rx_state); - } - - if (new_state == CAN_STATE_BUS_OFF && old_state < CAN_STATE_BUS_OFF) { - if (!priv->can.restart_ms) - kvaser_usb_hydra_send_simple_cmd_async - (priv, CMD_STOP_CHIP_REQ); - - can_bus_off(netdev); - } - - if (!skb) { + kvaser_usb_hydra_change_state(priv, bec, cf, new_state); + if (skb) + netif_rx(skb); + else netdev_warn(netdev, "No memory left for err_skb\n"); - return; - } - - if (priv->can.restart_ms && - old_state >= CAN_STATE_BUS_OFF && - new_state < CAN_STATE_BUS_OFF) - priv->can.can_stats.restarts++; - - if (new_state != CAN_STATE_BUS_OFF) { - cf->can_id |= CAN_ERR_CNT; - cf->data[6] = bec->txerr; - cf->data[7] = bec->rxerr; - } - - netif_rx(skb); } static void kvaser_usb_hydra_state_event(const struct kvaser_usb *dev, @@ -1078,9 +1084,8 @@ kvaser_usb_hydra_error_frame(struct kvaser_usb_net_priv *priv, { struct net_device *netdev = priv->netdev; struct net_device_stats *stats = &netdev->stats; - struct can_frame *cf; - struct sk_buff *skb; - struct skb_shared_hwtstamps *shhwtstamps; + struct can_frame *cf = NULL; + struct sk_buff *skb = NULL; struct can_berr_counter bec; enum can_state new_state, old_state; u8 bus_status; @@ -1096,52 +1101,26 @@ kvaser_usb_hydra_error_frame(struct kvaser_usb_net_priv *priv, kvaser_usb_hydra_bus_status_to_can_state(priv, bus_status, &bec, &new_state); - skb = alloc_can_err_skb(netdev, &cf); + if (priv->can.ctrlmode & CAN_CTRLMODE_BERR_REPORTING) + skb = alloc_can_err_skb(netdev, &cf); + if (new_state != old_state) + kvaser_usb_hydra_change_state(priv, &bec, cf, new_state); - if (new_state != old_state) { + if (priv->can.ctrlmode & CAN_CTRLMODE_BERR_REPORTING) { if (skb) { - enum can_state tx_state, rx_state; - - tx_state = (bec.txerr >= bec.rxerr) ? - new_state : CAN_STATE_ERROR_ACTIVE; - rx_state = (bec.txerr <= bec.rxerr) ? 
- new_state : CAN_STATE_ERROR_ACTIVE; - - can_change_state(netdev, cf, tx_state, rx_state); - - if (priv->can.restart_ms && - old_state >= CAN_STATE_BUS_OFF && - new_state < CAN_STATE_BUS_OFF) - cf->can_id |= CAN_ERR_RESTARTED; - } - - if (new_state == CAN_STATE_BUS_OFF) { - if (!priv->can.restart_ms) - kvaser_usb_hydra_send_simple_cmd_async - (priv, CMD_STOP_CHIP_REQ); + struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb); - can_bus_off(netdev); + shhwtstamps->hwtstamp = hwtstamp; + cf->can_id |= CAN_ERR_BUSERROR | CAN_ERR_CNT; + cf->data[6] = bec.txerr; + cf->data[7] = bec.rxerr; + netif_rx(skb); + } else { + stats->rx_dropped++; + netdev_warn(netdev, "No memory left for err_skb\n"); } } - if (!skb) { - stats->rx_dropped++; - netdev_warn(netdev, "No memory left for err_skb\n"); - return; - } - - shhwtstamps = skb_hwtstamps(skb); - shhwtstamps->hwtstamp = hwtstamp; - - cf->can_id |= CAN_ERR_BUSERROR; - if (new_state != CAN_STATE_BUS_OFF) { - cf->can_id |= CAN_ERR_CNT; - cf->data[6] = bec.txerr; - cf->data[7] = bec.rxerr; - } - - netif_rx(skb); - priv->bec.txerr = bec.txerr; priv->bec.rxerr = bec.rxerr; } diff --git a/drivers/net/can/usb/kvaser_usb/kvaser_usb_leaf.c b/drivers/net/can/usb/kvaser_usb/kvaser_usb_leaf.c index 6b9122ab1464..6a45adcc45bd 100644 --- a/drivers/net/can/usb/kvaser_usb/kvaser_usb_leaf.c +++ b/drivers/net/can/usb/kvaser_usb/kvaser_usb_leaf.c @@ -1120,10 +1120,8 @@ kvaser_usb_leaf_rx_error_update_can_state(struct kvaser_usb_net_priv *priv, static void kvaser_usb_leaf_rx_error(const struct kvaser_usb *dev, const struct kvaser_usb_err_summary *es) { - struct can_frame *cf; - struct can_frame tmp_cf = { .can_id = CAN_ERR_FLAG, - .len = CAN_ERR_DLC }; - struct sk_buff *skb; + struct can_frame *cf = NULL; + struct sk_buff *skb = NULL; struct net_device_stats *stats; struct kvaser_usb_net_priv *priv; struct kvaser_usb_net_leaf_priv *leaf; @@ -1143,18 +1141,10 @@ static void kvaser_usb_leaf_rx_error(const struct kvaser_usb *dev, if 
(!netif_running(priv->netdev)) return; - /* Update all of the CAN interface's state and error counters before - * trying any memory allocation that can actually fail with -ENOMEM. - * - * We send a temporary stack-allocated error CAN frame to - * can_change_state() for the very same reason. - * - * TODO: Split can_change_state() responsibility between updating the - * CAN interface's state and counters, and the setting up of CAN error - * frame ID and data to userspace. Remove stack allocation afterwards. - */ old_state = priv->can.state; - kvaser_usb_leaf_rx_error_update_can_state(priv, es, &tmp_cf); + if (priv->can.ctrlmode & CAN_CTRLMODE_BERR_REPORTING) + skb = alloc_can_err_skb(priv->netdev, &cf); + kvaser_usb_leaf_rx_error_update_can_state(priv, es, cf); new_state = priv->can.state; /* If there are errors, request status updates periodically as we do @@ -1168,13 +1158,6 @@ static void kvaser_usb_leaf_rx_error(const struct kvaser_usb *dev, schedule_delayed_work(&leaf->chip_state_req_work, msecs_to_jiffies(500)); - skb = alloc_can_err_skb(priv->netdev, &cf); - if (!skb) { - stats->rx_dropped++; - return; - } - memcpy(cf, &tmp_cf, sizeof(*cf)); - if (new_state != old_state) { if (es->status & (M16C_STATE_BUS_OFF | M16C_STATE_BUS_RESET)) { @@ -1187,11 +1170,20 @@ static void kvaser_usb_leaf_rx_error(const struct kvaser_usb *dev, if (priv->can.restart_ms && old_state == CAN_STATE_BUS_OFF && new_state < CAN_STATE_BUS_OFF) { - cf->can_id |= CAN_ERR_RESTARTED; + if (cf) + cf->can_id |= CAN_ERR_RESTARTED; netif_carrier_on(priv->netdev); } } + if (!skb) { + if (priv->can.ctrlmode & CAN_CTRLMODE_BERR_REPORTING) { + stats->rx_dropped++; + netdev_warn(priv->netdev, "No memory left for err_skb\n"); + } + return; + } + switch (dev->driver_info->family) { case KVASER_LEAF: if (es->leaf.error_factor) { diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index 0561b60f668f..79dc77835681 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ 
b/drivers/net/dsa/b53/b53_common.c @@ -2232,12 +2232,6 @@ bool b53_support_eee(struct dsa_switch *ds, int port) } EXPORT_SYMBOL(b53_support_eee); -int b53_get_mac_eee(struct dsa_switch *ds, int port, struct ethtool_keee *e) -{ - return 0; -} -EXPORT_SYMBOL(b53_get_mac_eee); - int b53_set_mac_eee(struct dsa_switch *ds, int port, struct ethtool_keee *e) { struct b53_device *dev = ds->priv; @@ -2299,7 +2293,6 @@ static const struct dsa_switch_ops b53_switch_ops = { .port_enable = b53_enable_port, .port_disable = b53_disable_port, .support_eee = b53_support_eee, - .get_mac_eee = b53_get_mac_eee, .set_mac_eee = b53_set_mac_eee, .port_bridge_join = b53_br_join, .port_bridge_leave = b53_br_leave, diff --git a/drivers/net/dsa/b53/b53_priv.h b/drivers/net/dsa/b53/b53_priv.h index 99e5cfc98ae8..9e9b5bc0c5d6 100644 --- a/drivers/net/dsa/b53/b53_priv.h +++ b/drivers/net/dsa/b53/b53_priv.h @@ -385,7 +385,6 @@ void b53_disable_port(struct dsa_switch *ds, int port); void b53_brcm_hdr_setup(struct dsa_switch *ds, int port); int b53_eee_init(struct dsa_switch *ds, int port, struct phy_device *phy); bool b53_support_eee(struct dsa_switch *ds, int port); -int b53_get_mac_eee(struct dsa_switch *ds, int port, struct ethtool_keee *e); int b53_set_mac_eee(struct dsa_switch *ds, int port, struct ethtool_keee *e); #endif diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c index a53fb6191e6b..fa2bf3fa9019 100644 --- a/drivers/net/dsa/bcm_sf2.c +++ b/drivers/net/dsa/bcm_sf2.c @@ -1233,7 +1233,6 @@ static const struct dsa_switch_ops bcm_sf2_ops = { .port_enable = bcm_sf2_port_setup, .port_disable = bcm_sf2_port_disable, .support_eee = b53_support_eee, - .get_mac_eee = b53_get_mac_eee, .set_mac_eee = b53_set_mac_eee, .port_bridge_join = b53_br_join, .port_bridge_leave = b53_br_leave, diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c index e3512e324572..89f0796894af 100644 --- a/drivers/net/dsa/microchip/ksz_common.c +++ 
b/drivers/net/dsa/microchip/ksz_common.c @@ -3489,20 +3489,6 @@ static bool ksz_support_eee(struct dsa_switch *ds, int port) return false; } -static int ksz_get_mac_eee(struct dsa_switch *ds, int port, - struct ethtool_keee *e) -{ - /* There is no documented control of Tx LPI configuration. */ - e->tx_lpi_enabled = true; - - /* There is no documented control of Tx LPI timer. According to tests - * Tx LPI timer seems to be set by default to minimal value. - */ - e->tx_lpi_timer = 0; - - return 0; -} - static int ksz_set_mac_eee(struct dsa_switch *ds, int port, struct ethtool_keee *e) { @@ -4672,7 +4658,6 @@ static const struct dsa_switch_ops ksz_switch_ops = { .cls_flower_del = ksz_cls_flower_del, .port_setup_tc = ksz_setup_tc, .support_eee = ksz_support_eee, - .get_mac_eee = ksz_get_mac_eee, .set_mac_eee = ksz_set_mac_eee, .port_get_default_prio = ksz_port_get_default_prio, .port_set_default_prio = ksz_port_set_default_prio, diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c index 9605febd3573..d2d0f091e49e 100644 --- a/drivers/net/dsa/mt7530.c +++ b/drivers/net/dsa/mt7530.c @@ -3085,18 +3085,6 @@ mt753x_setup(struct dsa_switch *ds) return ret; } -static int mt753x_get_mac_eee(struct dsa_switch *ds, int port, - struct ethtool_keee *e) -{ - struct mt7530_priv *priv = ds->priv; - u32 eeecr = mt7530_read(priv, MT753X_PMEEECR_P(port)); - - e->tx_lpi_enabled = !(eeecr & LPI_MODE_EN); - e->tx_lpi_timer = LPI_THRESH_GET(eeecr); - - return 0; -} - static int mt753x_set_mac_eee(struct dsa_switch *ds, int port, struct ethtool_keee *e) { @@ -3239,7 +3227,6 @@ const struct dsa_switch_ops mt7530_switch_ops = { .port_mirror_del = mt753x_port_mirror_del, .phylink_get_caps = mt753x_phylink_get_caps, .support_eee = dsa_supports_eee, - .get_mac_eee = mt753x_get_mac_eee, .set_mac_eee = mt753x_set_mac_eee, .conduit_state_change = mt753x_conduit_state_change, .port_setup_tc = mt753x_setup_tc, diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c 
index 570c8642d387..35ae084af166 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -1513,13 +1513,6 @@ static void mv88e6xxx_get_regs(struct dsa_switch *ds, int port, mv88e6xxx_reg_unlock(chip); } -static int mv88e6xxx_get_mac_eee(struct dsa_switch *ds, int port, - struct ethtool_keee *e) -{ - /* Nothing to do on the port's MAC */ - return 0; -} - static int mv88e6xxx_set_mac_eee(struct dsa_switch *ds, int port, struct ethtool_keee *e) { @@ -7100,7 +7093,6 @@ static const struct dsa_switch_ops mv88e6xxx_switch_ops = { .port_max_mtu = mv88e6xxx_get_max_mtu, .port_change_mtu = mv88e6xxx_change_mtu, .support_eee = dsa_supports_eee, - .get_mac_eee = mv88e6xxx_get_mac_eee, .set_mac_eee = mv88e6xxx_set_mac_eee, .get_eeprom_len = mv88e6xxx_get_eeprom_len, .get_eeprom = mv88e6xxx_get_eeprom, diff --git a/drivers/net/dsa/qca/qca8k-8xxx.c b/drivers/net/dsa/qca/qca8k-8xxx.c index 90e24bc00b99..750fc76a6e11 100644 --- a/drivers/net/dsa/qca/qca8k-8xxx.c +++ b/drivers/net/dsa/qca/qca8k-8xxx.c @@ -1019,7 +1019,7 @@ qca8k_setup_mdio_bus(struct qca8k_priv *priv) of_get_phy_mode(port, &mode); - if (of_property_read_bool(port, "phy-handle") && + if (of_property_present(port, "phy-handle") && mode != PHY_INTERFACE_MODE_INTERNAL) external_mdio_mask |= BIT(reg); else @@ -2017,7 +2017,6 @@ static const struct dsa_switch_ops qca8k_switch_ops = { .get_sset_count = qca8k_get_sset_count, .set_ageing_time = qca8k_set_ageing_time, .support_eee = dsa_supports_eee, - .get_mac_eee = qca8k_get_mac_eee, .set_mac_eee = qca8k_set_mac_eee, .port_enable = qca8k_port_enable, .port_disable = qca8k_port_disable, diff --git a/drivers/net/dsa/qca/qca8k-common.c b/drivers/net/dsa/qca/qca8k-common.c index 560c74c4ac3d..13005f10edb7 100644 --- a/drivers/net/dsa/qca/qca8k-common.c +++ b/drivers/net/dsa/qca/qca8k-common.c @@ -557,13 +557,6 @@ exit: return ret; } -int qca8k_get_mac_eee(struct dsa_switch *ds, int port, - struct ethtool_keee *e) -{ - /* Nothing to do on the 
port's MAC */ - return 0; -} - static int qca8k_port_configure_learning(struct dsa_switch *ds, int port, bool learning) { diff --git a/drivers/net/dsa/qca/qca8k.h b/drivers/net/dsa/qca/qca8k.h index 24962a395754..d046679265fa 100644 --- a/drivers/net/dsa/qca/qca8k.h +++ b/drivers/net/dsa/qca/qca8k.h @@ -520,7 +520,6 @@ int qca8k_get_sset_count(struct dsa_switch *ds, int port, int sset); /* Common eee function */ int qca8k_set_mac_eee(struct dsa_switch *ds, int port, struct ethtool_keee *eee); -int qca8k_get_mac_eee(struct dsa_switch *ds, int port, struct ethtool_keee *e); /* Common bridge function */ void qca8k_port_stp_state_set(struct dsa_switch *ds, int port, u8 state); diff --git a/drivers/net/ethernet/amd/pds_core/devlink.c b/drivers/net/ethernet/amd/pds_core/devlink.c index 2681889162a2..44971e71991f 100644 --- a/drivers/net/ethernet/amd/pds_core/devlink.c +++ b/drivers/net/ethernet/amd/pds_core/devlink.c @@ -118,7 +118,7 @@ int pdsc_dl_info_get(struct devlink *dl, struct devlink_info_req *req, if (err && err != -EIO) return err; - listlen = fw_list.num_fw_slots; + listlen = min(fw_list.num_fw_slots, ARRAY_SIZE(fw_list.fw_names)); for (i = 0; i < listlen; i++) { if (i < ARRAY_SIZE(fw_slotnames)) strscpy(buf, fw_slotnames[i], sizeof(buf)); diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 46edea75e062..884d42db5554 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -2897,6 +2897,13 @@ static int bnxt_hwrm_handler(struct bnxt *bp, struct tx_cmp *txcmp) return 0; } +static bool bnxt_vnic_is_active(struct bnxt *bp) +{ + struct bnxt_vnic_info *vnic = &bp->vnic_info[0]; + + return vnic->fw_vnic_id != INVALID_HW_RING_ID && vnic->mru > 0; +} + static irqreturn_t bnxt_msix(int irq, void *dev_instance) { struct bnxt_napi *bnapi = dev_instance; @@ -3164,7 +3171,7 @@ static int bnxt_poll(struct napi_struct *napi, int budget) break; } } - if (bp->flags & 
BNXT_FLAG_DIM) { + if ((bp->flags & BNXT_FLAG_DIM) && bnxt_vnic_is_active(bp)) { struct dim_sample dim_sample = {}; dim_update_sample(cpr->event_ctr, @@ -3295,7 +3302,7 @@ static int bnxt_poll_p5(struct napi_struct *napi, int budget) poll_done: cpr_rx = &cpr->cp_ring_arr[0]; if (cpr_rx->cp_ring_type == BNXT_NQ_HDL_TYPE_RX && - (bp->flags & BNXT_FLAG_DIM)) { + (bp->flags & BNXT_FLAG_DIM) && bnxt_vnic_is_active(bp)) { struct dim_sample dim_sample = {}; dim_update_sample(cpr->event_ctr, @@ -7266,6 +7273,26 @@ err_out: return rc; } +static void bnxt_cancel_dim(struct bnxt *bp) +{ + int i; + + /* DIM work is initialized in bnxt_enable_napi(). Proceed only + * if NAPI is enabled. + */ + if (!bp->bnapi || test_bit(BNXT_STATE_NAPI_DISABLED, &bp->state)) + return; + + /* Make sure NAPI sees that the VNIC is disabled */ + synchronize_net(); + for (i = 0; i < bp->rx_nr_rings; i++) { + struct bnxt_rx_ring_info *rxr = &bp->rx_ring[i]; + struct bnxt_napi *bnapi = rxr->bnapi; + + cancel_work_sync(&bnapi->cp_ring.dim.work); + } +} + static int hwrm_ring_free_send_msg(struct bnxt *bp, struct bnxt_ring_struct *ring, u32 ring_type, int cmpl_ring_id) @@ -7366,6 +7393,7 @@ static void bnxt_hwrm_ring_free(struct bnxt *bp, bool close_path) } } + bnxt_cancel_dim(bp); for (i = 0; i < bp->rx_nr_rings; i++) { bnxt_hwrm_rx_ring_free(bp, &bp->rx_ring[i], close_path); bnxt_hwrm_rx_agg_ring_free(bp, &bp->rx_ring[i], close_path); @@ -11330,8 +11358,6 @@ static void bnxt_disable_napi(struct bnxt *bp) if (bnapi->in_reset) cpr->sw_stats->rx.rx_resets++; napi_disable(&bnapi->napi); - if (bnapi->rx_ring) - cancel_work_sync(&cpr->dim.work); } } @@ -15613,8 +15639,10 @@ static int bnxt_queue_stop(struct net_device *dev, void *qmem, int idx) bnxt_hwrm_vnic_update(bp, vnic, VNIC_UPDATE_REQ_ENABLES_MRU_VALID); } - + /* Make sure NAPI sees that the VNIC is disabled */ + synchronize_net(); rxr = &bp->rx_ring[idx]; + cancel_work_sync(&rxr->bnapi->cp_ring.dim.work); bnxt_hwrm_rx_ring_free(bp, rxr, false); 
bnxt_hwrm_rx_agg_ring_free(bp, rxr, false); rxr->rx_next_cons = 0; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c index 94c6a0928ca0..7c88b9f05c4c 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c @@ -208,7 +208,7 @@ int bnxt_send_msg(struct bnxt_en_dev *edev, rc = hwrm_req_replace(bp, req, fw_msg->msg, fw_msg->msg_len); if (rc) - return rc; + goto drop_req; hwrm_req_timeout(bp, req, fw_msg->timeout); resp = hwrm_req_hold(bp, req); @@ -220,6 +220,7 @@ int bnxt_send_msg(struct bnxt_en_dev *edev, memcpy(fw_msg->resp, resp, resp_len); } +drop_req: hwrm_req_drop(bp, req); return rc; } diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c index bc3af0054406..2f0b3e389e62 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c @@ -1799,7 +1799,10 @@ void cxgb4_remove_tid(struct tid_info *t, unsigned int chan, unsigned int tid, struct adapter *adap = container_of(t, struct adapter, tids); struct sk_buff *skb; - WARN_ON(tid_out_of_range(&adap->tids, tid)); + if (tid_out_of_range(&adap->tids, tid)) { + dev_err(adap->pdev_dev, "tid %d out of range\n", tid); + return; + } if (t->tid_tab[tid - adap->tids.tid_base]) { t->tid_tab[tid - adap->tids.tid_base] = NULL; @@ -6559,6 +6562,9 @@ static void cxgb4_advance_esn_state(struct xfrm_state *x) { struct adapter *adap = netdev2adap(x->xso.dev); + if (x->xso.dir != XFRM_DEV_OFFLOAD_IN) + return; + if (!mutex_trylock(&uld_mutex)) { dev_dbg(adap->pdev_dev, "crypto uld critical resource is under use\n"); diff --git a/drivers/net/ethernet/cisco/enic/enic_main.c b/drivers/net/ethernet/cisco/enic/enic_main.c index 9913952ccb42..49f6cab01ed5 100644 --- a/drivers/net/ethernet/cisco/enic/enic_main.c +++ b/drivers/net/ethernet/cisco/enic/enic_main.c @@ -109,7 +109,7 @@ static struct enic_intr_mod_table 
mod_table[ENIC_MAX_COALESCE_TIMERS + 1] = { static struct enic_intr_mod_range mod_range[ENIC_MAX_LINK_SPEEDS] = { {0, 0}, /* 0 - 4 Gbps */ {0, 3}, /* 4 - 10 Gbps */ - {3, 6}, /* 10 - 40 Gbps */ + {3, 6}, /* 10+ Gbps */ }; static void enic_init_affinity_hint(struct enic *enic) @@ -428,6 +428,36 @@ static void enic_mtu_check(struct enic *enic) } } +static void enic_set_rx_coal_setting(struct enic *enic) +{ + unsigned int speed; + int index = -1; + struct enic_rx_coal *rx_coal = &enic->rx_coalesce_setting; + + /* 1. Read the link speed from fw + * 2. Pick the default range for the speed + * 3. Update it in enic->rx_coalesce_setting + */ + speed = vnic_dev_port_speed(enic->vdev); + if (speed > ENIC_LINK_SPEED_10G) + index = ENIC_LINK_40G_INDEX; + else if (speed > ENIC_LINK_SPEED_4G) + index = ENIC_LINK_10G_INDEX; + else + index = ENIC_LINK_4G_INDEX; + + rx_coal->small_pkt_range_start = mod_range[index].small_pkt_range_start; + rx_coal->large_pkt_range_start = mod_range[index].large_pkt_range_start; + rx_coal->range_end = ENIC_RX_COALESCE_RANGE_END; + + /* Start with the value provided by UCSM */ + for (index = 0; index < enic->rq_count; index++) + enic->cq[index].cur_rx_coal_timeval = + enic->config.intr_timer_usec; + + rx_coal->use_adaptive_rx_coalesce = 1; +} + static void enic_link_check(struct enic *enic) { int link_status = vnic_dev_link_status(enic->vdev); @@ -436,6 +466,7 @@ static void enic_link_check(struct enic *enic) if (link_status && !carrier_ok) { netdev_info(enic->netdev, "Link UP\n"); netif_carrier_on(enic->netdev); + enic_set_rx_coal_setting(enic); } else if (!link_status && carrier_ok) { netdev_info(enic->netdev, "Link DOWN\n"); netif_carrier_off(enic->netdev); @@ -1901,36 +1932,6 @@ static void enic_synchronize_irqs(struct enic *enic) } } -static void enic_set_rx_coal_setting(struct enic *enic) -{ - unsigned int speed; - int index = -1; - struct enic_rx_coal *rx_coal = &enic->rx_coalesce_setting; - - /* 1. Read the link speed from fw - * 2. 
Pick the default range for the speed - * 3. Update it in enic->rx_coalesce_setting - */ - speed = vnic_dev_port_speed(enic->vdev); - if (ENIC_LINK_SPEED_10G < speed) - index = ENIC_LINK_40G_INDEX; - else if (ENIC_LINK_SPEED_4G < speed) - index = ENIC_LINK_10G_INDEX; - else - index = ENIC_LINK_4G_INDEX; - - rx_coal->small_pkt_range_start = mod_range[index].small_pkt_range_start; - rx_coal->large_pkt_range_start = mod_range[index].large_pkt_range_start; - rx_coal->range_end = ENIC_RX_COALESCE_RANGE_END; - - /* Start with the value provided by UCSM */ - for (index = 0; index < enic->rq_count; index++) - enic->cq[index].cur_rx_coal_timeval = - enic->config.intr_timer_usec; - - rx_coal->use_adaptive_rx_coalesce = 1; -} - static int enic_dev_notify_set(struct enic *enic) { int err; @@ -3063,7 +3064,6 @@ static int enic_probe(struct pci_dev *pdev, const struct pci_device_id *ent) timer_setup(&enic->notify_timer, enic_notify_timer, 0); enic_rfs_flw_tbl_init(enic); - enic_set_rx_coal_setting(enic); INIT_WORK(&enic->reset, enic_reset); INIT_WORK(&enic->tx_hang_reset, enic_tx_hang_reset); INIT_WORK(&enic->change_mtu_work, enic_change_mtu_work); diff --git a/drivers/net/ethernet/freescale/ucc_geth.c b/drivers/net/ethernet/freescale/ucc_geth.c index f47f8177a93b..88510f822759 100644 --- a/drivers/net/ethernet/freescale/ucc_geth.c +++ b/drivers/net/ethernet/freescale/ucc_geth.c @@ -1434,7 +1434,7 @@ static void ugeth_activate(struct ucc_geth_private *ugeth) /* allow to xmit again */ netif_tx_wake_all_queues(ugeth->ndev); - __netdev_watchdog_up(ugeth->ndev); + netdev_watchdog_up(ugeth->ndev); } /* Initialize TBI PHY interface for communicating with the diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c index 8a8f6ab12a98..533e659b15b3 100644 --- a/drivers/net/ethernet/google/gve/gve_main.c +++ b/drivers/net/ethernet/google/gve/gve_main.c @@ -2241,14 +2241,18 @@ static void gve_service_task(struct work_struct *work) static void 
gve_set_netdev_xdp_features(struct gve_priv *priv) { + xdp_features_t xdp_features; + if (priv->queue_format == GVE_GQI_QPL_FORMAT) { - priv->dev->xdp_features = NETDEV_XDP_ACT_BASIC; - priv->dev->xdp_features |= NETDEV_XDP_ACT_REDIRECT; - priv->dev->xdp_features |= NETDEV_XDP_ACT_NDO_XMIT; - priv->dev->xdp_features |= NETDEV_XDP_ACT_XSK_ZEROCOPY; + xdp_features = NETDEV_XDP_ACT_BASIC; + xdp_features |= NETDEV_XDP_ACT_REDIRECT; + xdp_features |= NETDEV_XDP_ACT_NDO_XMIT; + xdp_features |= NETDEV_XDP_ACT_XSK_ZEROCOPY; } else { - priv->dev->xdp_features = 0; + xdp_features = 0; } + + xdp_set_features_flag(priv->dev, xdp_features); } static int gve_init_priv(struct gve_priv *priv, bool skip_describe_device) diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h index 710a8f9f2248..12ba380eb701 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h +++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h @@ -916,9 +916,6 @@ struct hnae3_handle { u8 netdev_flags; struct dentry *hnae3_dbgfs; - /* protects concurrent contention between debugfs commands */ - struct mutex dbgfs_lock; - char **dbgfs_buf; /* Network interface message level enabled bits */ u32 msg_enable; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c b/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c index 807eb3bbb11c..9bbece25552b 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c @@ -1260,69 +1260,55 @@ static int hns3_dbg_read_cmd(struct hns3_dbg_data *dbg_data, static ssize_t hns3_dbg_read(struct file *filp, char __user *buffer, size_t count, loff_t *ppos) { - struct hns3_dbg_data *dbg_data = filp->private_data; + char *buf = filp->private_data; + + return simple_read_from_buffer(buffer, count, ppos, buf, strlen(buf)); +} + +static int hns3_dbg_open(struct inode *inode, struct file *filp) +{ + struct hns3_dbg_data *dbg_data = inode->i_private; struct hnae3_handle *handle = 
dbg_data->handle; struct hns3_nic_priv *priv = handle->priv; - ssize_t size = 0; - char **save_buf; - char *read_buf; u32 index; + char *buf; int ret; + if (!test_bit(HNS3_NIC_STATE_INITED, &priv->state) || + test_bit(HNS3_NIC_STATE_RESETTING, &priv->state)) + return -EBUSY; + ret = hns3_dbg_get_cmd_index(dbg_data, &index); if (ret) return ret; - mutex_lock(&handle->dbgfs_lock); - save_buf = &handle->dbgfs_buf[index]; - - if (!test_bit(HNS3_NIC_STATE_INITED, &priv->state) || - test_bit(HNS3_NIC_STATE_RESETTING, &priv->state)) { - ret = -EBUSY; - goto out; - } - - if (*save_buf) { - read_buf = *save_buf; - } else { - read_buf = kvzalloc(hns3_dbg_cmd[index].buf_len, GFP_KERNEL); - if (!read_buf) { - ret = -ENOMEM; - goto out; - } - - /* save the buffer addr until the last read operation */ - *save_buf = read_buf; - - /* get data ready for the first time to read */ - ret = hns3_dbg_read_cmd(dbg_data, hns3_dbg_cmd[index].cmd, - read_buf, hns3_dbg_cmd[index].buf_len); - if (ret) - goto out; - } + buf = kvzalloc(hns3_dbg_cmd[index].buf_len, GFP_KERNEL); + if (!buf) + return -ENOMEM; - size = simple_read_from_buffer(buffer, count, ppos, read_buf, - strlen(read_buf)); - if (size > 0) { - mutex_unlock(&handle->dbgfs_lock); - return size; + ret = hns3_dbg_read_cmd(dbg_data, hns3_dbg_cmd[index].cmd, + buf, hns3_dbg_cmd[index].buf_len); + if (ret) { + kvfree(buf); + return ret; } -out: - /* free the buffer for the last read operation */ - if (*save_buf) { - kvfree(*save_buf); - *save_buf = NULL; - } + filp->private_data = buf; + return 0; +} - mutex_unlock(&handle->dbgfs_lock); - return ret; +static int hns3_dbg_release(struct inode *inode, struct file *filp) +{ + kvfree(filp->private_data); + filp->private_data = NULL; + return 0; } static const struct file_operations hns3_dbg_fops = { .owner = THIS_MODULE, - .open = simple_open, + .open = hns3_dbg_open, .read = hns3_dbg_read, + .release = hns3_dbg_release, }; static int hns3_dbg_bd_file_init(struct hnae3_handle *handle, u32 
cmd) @@ -1379,13 +1365,6 @@ int hns3_dbg_init(struct hnae3_handle *handle) int ret; u32 i; - handle->dbgfs_buf = devm_kcalloc(&handle->pdev->dev, - ARRAY_SIZE(hns3_dbg_cmd), - sizeof(*handle->dbgfs_buf), - GFP_KERNEL); - if (!handle->dbgfs_buf) - return -ENOMEM; - hns3_dbg_dentry[HNS3_DBG_DENTRY_COMMON].dentry = debugfs_create_dir(name, hns3_dbgfs_root); handle->hnae3_dbgfs = hns3_dbg_dentry[HNS3_DBG_DENTRY_COMMON].dentry; @@ -1395,8 +1374,6 @@ int hns3_dbg_init(struct hnae3_handle *handle) debugfs_create_dir(hns3_dbg_dentry[i].name, handle->hnae3_dbgfs); - mutex_init(&handle->dbgfs_lock); - for (i = 0; i < ARRAY_SIZE(hns3_dbg_cmd); i++) { if ((hns3_dbg_cmd[i].cmd == HNAE3_DBG_CMD_TM_NODES && ae_dev->dev_version <= HNAE3_DEVICE_VERSION_V2) || @@ -1425,24 +1402,13 @@ int hns3_dbg_init(struct hnae3_handle *handle) out: debugfs_remove_recursive(handle->hnae3_dbgfs); handle->hnae3_dbgfs = NULL; - mutex_destroy(&handle->dbgfs_lock); return ret; } void hns3_dbg_uninit(struct hnae3_handle *handle) { - u32 i; - debugfs_remove_recursive(handle->hnae3_dbgfs); handle->hnae3_dbgfs = NULL; - - for (i = 0; i < ARRAY_SIZE(hns3_dbg_cmd); i++) - if (handle->dbgfs_buf[i]) { - kvfree(handle->dbgfs_buf[i]); - handle->dbgfs_buf[i] = NULL; - } - - mutex_destroy(&handle->dbgfs_lock); } void hns3_dbg_register_debugfs(const char *debugfs_dir_name) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index 43377a7b2426..a7e3b22f641c 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -2452,7 +2452,6 @@ static int hns3_nic_set_features(struct net_device *netdev, return ret; } - netdev->features = features; return 0; } diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 05942fa78b11..db7845009252 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ 
b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -6,6 +6,7 @@ #include <linux/etherdevice.h> #include <linux/init.h> #include <linux/interrupt.h> +#include <linux/irq.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/netdevice.h> @@ -3574,6 +3575,17 @@ static int hclge_set_vf_link_state(struct hnae3_handle *handle, int vf, return ret; } +static void hclge_set_reset_pending(struct hclge_dev *hdev, + enum hnae3_reset_type reset_type) +{ + /* When an incorrect reset type is executed, the get_reset_level + * function generates the HNAE3_NONE_RESET flag. As a result, this + * type do not need to pending. + */ + if (reset_type != HNAE3_NONE_RESET) + set_bit(reset_type, &hdev->reset_pending); +} + static u32 hclge_check_event_cause(struct hclge_dev *hdev, u32 *clearval) { u32 cmdq_src_reg, msix_src_reg, hw_err_src_reg; @@ -3594,7 +3606,7 @@ static u32 hclge_check_event_cause(struct hclge_dev *hdev, u32 *clearval) */ if (BIT(HCLGE_VECTOR0_IMPRESET_INT_B) & msix_src_reg) { dev_info(&hdev->pdev->dev, "IMP reset interrupt\n"); - set_bit(HNAE3_IMP_RESET, &hdev->reset_pending); + hclge_set_reset_pending(hdev, HNAE3_IMP_RESET); set_bit(HCLGE_COMM_STATE_CMD_DISABLE, &hdev->hw.hw.comm_state); *clearval = BIT(HCLGE_VECTOR0_IMPRESET_INT_B); hdev->rst_stats.imp_rst_cnt++; @@ -3604,7 +3616,7 @@ static u32 hclge_check_event_cause(struct hclge_dev *hdev, u32 *clearval) if (BIT(HCLGE_VECTOR0_GLOBALRESET_INT_B) & msix_src_reg) { dev_info(&hdev->pdev->dev, "global reset interrupt\n"); set_bit(HCLGE_COMM_STATE_CMD_DISABLE, &hdev->hw.hw.comm_state); - set_bit(HNAE3_GLOBAL_RESET, &hdev->reset_pending); + hclge_set_reset_pending(hdev, HNAE3_GLOBAL_RESET); *clearval = BIT(HCLGE_VECTOR0_GLOBALRESET_INT_B); hdev->rst_stats.global_rst_cnt++; return HCLGE_VECTOR0_EVENT_RST; @@ -3759,7 +3771,7 @@ static int hclge_misc_irq_init(struct hclge_dev *hdev) snprintf(hdev->misc_vector.name, HNAE3_INT_NAME_LEN, "%s-misc-%s", HCLGE_NAME, pci_name(hdev->pdev)); ret = 
request_irq(hdev->misc_vector.vector_irq, hclge_misc_irq_handle, - 0, hdev->misc_vector.name, hdev); + IRQF_NO_AUTOEN, hdev->misc_vector.name, hdev); if (ret) { hclge_free_vector(hdev, 0); dev_err(&hdev->pdev->dev, "request misc irq(%d) fail\n", @@ -4052,7 +4064,7 @@ static void hclge_do_reset(struct hclge_dev *hdev) case HNAE3_FUNC_RESET: dev_info(&pdev->dev, "PF reset requested\n"); /* schedule again to check later */ - set_bit(HNAE3_FUNC_RESET, &hdev->reset_pending); + hclge_set_reset_pending(hdev, HNAE3_FUNC_RESET); hclge_reset_task_schedule(hdev); break; default: @@ -4086,6 +4098,8 @@ static enum hnae3_reset_type hclge_get_reset_level(struct hnae3_ae_dev *ae_dev, clear_bit(HNAE3_FLR_RESET, addr); } + clear_bit(HNAE3_NONE_RESET, addr); + if (hdev->reset_type != HNAE3_NONE_RESET && rst_level < hdev->reset_type) return HNAE3_NONE_RESET; @@ -4227,7 +4241,7 @@ static bool hclge_reset_err_handle(struct hclge_dev *hdev) return false; } else if (hdev->rst_stats.reset_fail_cnt < MAX_RESET_FAIL_CNT) { hdev->rst_stats.reset_fail_cnt++; - set_bit(hdev->reset_type, &hdev->reset_pending); + hclge_set_reset_pending(hdev, hdev->reset_type); dev_info(&hdev->pdev->dev, "re-schedule reset task(%u)\n", hdev->rst_stats.reset_fail_cnt); @@ -4470,8 +4484,20 @@ static void hclge_reset_event(struct pci_dev *pdev, struct hnae3_handle *handle) static void hclge_set_def_reset_request(struct hnae3_ae_dev *ae_dev, enum hnae3_reset_type rst_type) { +#define HCLGE_SUPPORT_RESET_TYPE \ + (BIT(HNAE3_FLR_RESET) | BIT(HNAE3_FUNC_RESET) | \ + BIT(HNAE3_GLOBAL_RESET) | BIT(HNAE3_IMP_RESET)) + struct hclge_dev *hdev = ae_dev->priv; + if (!(BIT(rst_type) & HCLGE_SUPPORT_RESET_TYPE)) { + /* To prevent reset triggered by hclge_reset_event */ + set_bit(HNAE3_NONE_RESET, &hdev->default_reset_request); + dev_warn(&hdev->pdev->dev, "unsupported reset type %d\n", + rst_type); + return; + } + set_bit(rst_type, &hdev->default_reset_request); } @@ -11881,9 +11907,6 @@ static int hclge_init_ae_dev(struct 
hnae3_ae_dev *ae_dev) hclge_init_rxd_adv_layout(hdev); - /* Enable MISC vector(vector0) */ - hclge_enable_vector(&hdev->misc_vector, true); - ret = hclge_init_wol(hdev); if (ret) dev_warn(&pdev->dev, @@ -11896,6 +11919,10 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev) hclge_state_init(hdev); hdev->last_reset_time = jiffies; + /* Enable MISC vector(vector0) */ + enable_irq(hdev->misc_vector.vector_irq); + hclge_enable_vector(&hdev->misc_vector, true); + dev_info(&hdev->pdev->dev, "%s driver initialization finished.\n", HCLGE_DRIVER_NAME); @@ -12301,7 +12328,7 @@ static void hclge_uninit_ae_dev(struct hnae3_ae_dev *ae_dev) /* Disable MISC vector(vector0) */ hclge_enable_vector(&hdev->misc_vector, false); - synchronize_irq(hdev->misc_vector.vector_irq); + disable_irq(hdev->misc_vector.vector_irq); /* Disable all hw interrupts */ hclge_config_mac_tnl_int(hdev, false); diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_ptp.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_ptp.c index 5505caea88e9..bab16c2191b2 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_ptp.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_ptp.c @@ -58,6 +58,9 @@ bool hclge_ptp_set_tx_info(struct hnae3_handle *handle, struct sk_buff *skb) struct hclge_dev *hdev = vport->back; struct hclge_ptp *ptp = hdev->ptp; + if (!ptp) + return false; + if (!test_bit(HCLGE_PTP_FLAG_TX_EN, &ptp->flags) || test_and_set_bit(HCLGE_STATE_PTP_TX_HANDLING, &hdev->state)) { ptp->tx_skipped++; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_regs.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_regs.c index 43c1c18fa81f..8c057192aae6 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_regs.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_regs.c @@ -510,9 +510,9 @@ out: static int hclge_fetch_pf_reg(struct hclge_dev *hdev, void *data, struct hnae3_knic_private_info *kinfo) { -#define HCLGE_RING_REG_OFFSET 0x200 #define 
HCLGE_RING_INT_REG_OFFSET 0x4 + struct hnae3_queue *tqp; int i, j, reg_num; int data_num_sum; u32 *reg = data; @@ -533,10 +533,11 @@ static int hclge_fetch_pf_reg(struct hclge_dev *hdev, void *data, reg_num = ARRAY_SIZE(ring_reg_addr_list); for (j = 0; j < kinfo->num_tqps; j++) { reg += hclge_reg_get_tlv(HCLGE_REG_TAG_RING, reg_num, reg); + tqp = kinfo->tqp[j]; for (i = 0; i < reg_num; i++) - *reg++ = hclge_read_dev(&hdev->hw, - ring_reg_addr_list[i] + - HCLGE_RING_REG_OFFSET * j); + *reg++ = readl_relaxed(tqp->io_base - + HCLGE_TQP_REG_OFFSET + + ring_reg_addr_list[i]); } data_num_sum += (reg_num + HCLGE_REG_TLV_SPACE) * kinfo->num_tqps; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c index 2f6ffb88e700..163c6e59ea4c 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c @@ -1393,6 +1393,17 @@ static int hclgevf_notify_roce_client(struct hclgevf_dev *hdev, return ret; } +static void hclgevf_set_reset_pending(struct hclgevf_dev *hdev, + enum hnae3_reset_type reset_type) +{ + /* When an incorrect reset type is executed, the get_reset_level + * function generates the HNAE3_NONE_RESET flag. As a result, this + * type do not need to pending. 
+ */ + if (reset_type != HNAE3_NONE_RESET) + set_bit(reset_type, &hdev->reset_pending); +} + static int hclgevf_reset_wait(struct hclgevf_dev *hdev) { #define HCLGEVF_RESET_WAIT_US 20000 @@ -1542,7 +1553,7 @@ static void hclgevf_reset_err_handle(struct hclgevf_dev *hdev) hdev->rst_stats.rst_fail_cnt); if (hdev->rst_stats.rst_fail_cnt < HCLGEVF_RESET_MAX_FAIL_CNT) - set_bit(hdev->reset_type, &hdev->reset_pending); + hclgevf_set_reset_pending(hdev, hdev->reset_type); if (hclgevf_is_reset_pending(hdev)) { set_bit(HCLGEVF_RESET_PENDING, &hdev->reset_state); @@ -1662,6 +1673,8 @@ static enum hnae3_reset_type hclgevf_get_reset_level(unsigned long *addr) clear_bit(HNAE3_FLR_RESET, addr); } + clear_bit(HNAE3_NONE_RESET, addr); + return rst_level; } @@ -1671,14 +1684,15 @@ static void hclgevf_reset_event(struct pci_dev *pdev, struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev); struct hclgevf_dev *hdev = ae_dev->priv; - dev_info(&hdev->pdev->dev, "received reset request from VF enet\n"); - if (hdev->default_reset_request) hdev->reset_level = hclgevf_get_reset_level(&hdev->default_reset_request); else hdev->reset_level = HNAE3_VF_FUNC_RESET; + dev_info(&hdev->pdev->dev, "received reset request from VF enet, reset level is %d\n", + hdev->reset_level); + /* reset of this VF requested */ set_bit(HCLGEVF_RESET_REQUESTED, &hdev->reset_state); hclgevf_reset_task_schedule(hdev); @@ -1689,8 +1703,20 @@ static void hclgevf_reset_event(struct pci_dev *pdev, static void hclgevf_set_def_reset_request(struct hnae3_ae_dev *ae_dev, enum hnae3_reset_type rst_type) { +#define HCLGEVF_SUPPORT_RESET_TYPE \ + (BIT(HNAE3_VF_RESET) | BIT(HNAE3_VF_FUNC_RESET) | \ + BIT(HNAE3_VF_PF_FUNC_RESET) | BIT(HNAE3_VF_FULL_RESET) | \ + BIT(HNAE3_FLR_RESET) | BIT(HNAE3_VF_EXP_RESET)) + struct hclgevf_dev *hdev = ae_dev->priv; + if (!(BIT(rst_type) & HCLGEVF_SUPPORT_RESET_TYPE)) { + /* To prevent reset triggered by hclge_reset_event */ + set_bit(HNAE3_NONE_RESET, &hdev->default_reset_request); + 
dev_info(&hdev->pdev->dev, "unsupported reset type %d\n", + rst_type); + return; + } set_bit(rst_type, &hdev->default_reset_request); } @@ -1847,14 +1873,14 @@ static void hclgevf_reset_service_task(struct hclgevf_dev *hdev) */ if (hdev->reset_attempts > HCLGEVF_MAX_RESET_ATTEMPTS_CNT) { /* prepare for full reset of stack + pcie interface */ - set_bit(HNAE3_VF_FULL_RESET, &hdev->reset_pending); + hclgevf_set_reset_pending(hdev, HNAE3_VF_FULL_RESET); /* "defer" schedule the reset task again */ set_bit(HCLGEVF_RESET_PENDING, &hdev->reset_state); } else { hdev->reset_attempts++; - set_bit(hdev->reset_level, &hdev->reset_pending); + hclgevf_set_reset_pending(hdev, hdev->reset_level); set_bit(HCLGEVF_RESET_PENDING, &hdev->reset_state); } hclgevf_reset_task_schedule(hdev); @@ -1977,7 +2003,7 @@ static enum hclgevf_evt_cause hclgevf_check_evt_cause(struct hclgevf_dev *hdev, rst_ing_reg = hclgevf_read_dev(&hdev->hw, HCLGEVF_RST_ING); dev_info(&hdev->pdev->dev, "receive reset interrupt 0x%x!\n", rst_ing_reg); - set_bit(HNAE3_VF_RESET, &hdev->reset_pending); + hclgevf_set_reset_pending(hdev, HNAE3_VF_RESET); set_bit(HCLGEVF_RESET_PENDING, &hdev->reset_state); set_bit(HCLGE_COMM_STATE_CMD_DISABLE, &hdev->hw.hw.comm_state); *clearval = ~(1U << HCLGEVF_VECTOR0_RST_INT_B); @@ -2287,6 +2313,8 @@ static void hclgevf_state_init(struct hclgevf_dev *hdev) clear_bit(HCLGEVF_STATE_RST_FAIL, &hdev->state); INIT_DELAYED_WORK(&hdev->service_task, hclgevf_service_task); + /* timer needs to be initialized before misc irq */ + timer_setup(&hdev->reset_timer, hclgevf_reset_timer, 0); mutex_init(&hdev->mbx_resp.mbx_mutex); sema_init(&hdev->reset_sem, 1); @@ -2986,7 +3014,6 @@ static int hclgevf_init_hdev(struct hclgevf_dev *hdev) HCLGEVF_DRIVER_NAME); hclgevf_task_schedule(hdev, round_jiffies_relative(HZ)); - timer_setup(&hdev->reset_timer, hclgevf_reset_timer, 0); return 0; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_regs.c 
b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_regs.c index 6db415d8b917..7d9d9dbc7560 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_regs.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_regs.c @@ -123,10 +123,10 @@ int hclgevf_get_regs_len(struct hnae3_handle *handle) void hclgevf_get_regs(struct hnae3_handle *handle, u32 *version, void *data) { -#define HCLGEVF_RING_REG_OFFSET 0x200 #define HCLGEVF_RING_INT_REG_OFFSET 0x4 struct hclgevf_dev *hdev = hclgevf_ae_get_hdev(handle); + struct hnae3_queue *tqp; int i, j, reg_um; u32 *reg = data; @@ -147,10 +147,11 @@ void hclgevf_get_regs(struct hnae3_handle *handle, u32 *version, reg_um = ARRAY_SIZE(ring_reg_addr_list); for (j = 0; j < hdev->num_tqps; j++) { reg += hclgevf_reg_get_tlv(HCLGEVF_REG_TAG_RING, reg_um, reg); + tqp = &hdev->htqp[j].q; for (i = 0; i < reg_um; i++) - *reg++ = hclgevf_read_dev(&hdev->hw, - ring_reg_addr_list[i] + - HCLGEVF_RING_REG_OFFSET * j); + *reg++ = readl_relaxed(tqp->io_base - + HCLGEVF_TQP_REG_OFFSET + + ring_reg_addr_list[i]); } reg_um = ARRAY_SIZE(tqp_intr_reg_addr_list); diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_pf.c b/drivers/net/ethernet/intel/fm10k/fm10k_pf.c index 98861cc6df7c..b9dd7b719832 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k_pf.c +++ b/drivers/net/ethernet/intel/fm10k/fm10k_pf.c @@ -1180,126 +1180,6 @@ s32 fm10k_iov_select_vid(struct fm10k_vf_info *vf_info, u16 vid) } /** - * fm10k_iov_msg_mac_vlan_pf - Message handler for MAC/VLAN request from VF - * @hw: Pointer to hardware structure - * @results: Pointer array to message, results[0] is pointer to message - * @mbx: Pointer to mailbox information structure - * - * This function is a default handler for MAC/VLAN requests from the VF. - * The assumption is that in this case it is acceptable to just directly - * hand off the message from the VF to the underlying shared code. 
- **/ -s32 fm10k_iov_msg_mac_vlan_pf(struct fm10k_hw *hw, u32 **results, - struct fm10k_mbx_info *mbx) -{ - struct fm10k_vf_info *vf_info = (struct fm10k_vf_info *)mbx; - u8 mac[ETH_ALEN]; - u32 *result; - int err = 0; - bool set; - u16 vlan; - u32 vid; - - /* we shouldn't be updating rules on a disabled interface */ - if (!FM10K_VF_FLAG_ENABLED(vf_info)) - err = FM10K_ERR_PARAM; - - if (!err && !!results[FM10K_MAC_VLAN_MSG_VLAN]) { - result = results[FM10K_MAC_VLAN_MSG_VLAN]; - - /* record VLAN id requested */ - err = fm10k_tlv_attr_get_u32(result, &vid); - if (err) - return err; - - set = !(vid & FM10K_VLAN_CLEAR); - vid &= ~FM10K_VLAN_CLEAR; - - /* if the length field has been set, this is a multi-bit - * update request. For multi-bit requests, simply disallow - * them when the pf_vid has been set. In this case, the PF - * should have already cleared the VLAN_TABLE, and if we - * allowed them, it could allow a rogue VF to receive traffic - * on a VLAN it was not assigned. In the single-bit case, we - * need to modify requests for VLAN 0 to use the default PF or - * SW vid when assigned. 
- */ - - if (vid >> 16) { - /* prevent multi-bit requests when PF has - * administratively set the VLAN for this VF - */ - if (vf_info->pf_vid) - return FM10K_ERR_PARAM; - } else { - err = fm10k_iov_select_vid(vf_info, (u16)vid); - if (err < 0) - return err; - - vid = err; - } - - /* update VSI info for VF in regards to VLAN table */ - err = hw->mac.ops.update_vlan(hw, vid, vf_info->vsi, set); - } - - if (!err && !!results[FM10K_MAC_VLAN_MSG_MAC]) { - result = results[FM10K_MAC_VLAN_MSG_MAC]; - - /* record unicast MAC address requested */ - err = fm10k_tlv_attr_get_mac_vlan(result, mac, &vlan); - if (err) - return err; - - /* block attempts to set MAC for a locked device */ - if (is_valid_ether_addr(vf_info->mac) && - !ether_addr_equal(mac, vf_info->mac)) - return FM10K_ERR_PARAM; - - set = !(vlan & FM10K_VLAN_CLEAR); - vlan &= ~FM10K_VLAN_CLEAR; - - err = fm10k_iov_select_vid(vf_info, vlan); - if (err < 0) - return err; - - vlan = (u16)err; - - /* notify switch of request for new unicast address */ - err = hw->mac.ops.update_uc_addr(hw, vf_info->glort, - mac, vlan, set, 0); - } - - if (!err && !!results[FM10K_MAC_VLAN_MSG_MULTICAST]) { - result = results[FM10K_MAC_VLAN_MSG_MULTICAST]; - - /* record multicast MAC address requested */ - err = fm10k_tlv_attr_get_mac_vlan(result, mac, &vlan); - if (err) - return err; - - /* verify that the VF is allowed to request multicast */ - if (!(vf_info->vf_flags & FM10K_VF_FLAG_MULTI_ENABLED)) - return FM10K_ERR_PARAM; - - set = !(vlan & FM10K_VLAN_CLEAR); - vlan &= ~FM10K_VLAN_CLEAR; - - err = fm10k_iov_select_vid(vf_info, vlan); - if (err < 0) - return err; - - vlan = (u16)err; - - /* notify switch of request for new multicast address */ - err = hw->mac.ops.update_mc_addr(hw, vf_info->glort, - mac, vlan, set); - } - - return err; -} - -/** * fm10k_iov_supported_xcast_mode_pf - Determine best match for xcast mode * @vf_info: VF info structure containing capability flags * @mode: Requested xcast mode diff --git 
a/drivers/net/ethernet/intel/fm10k/fm10k_pf.h b/drivers/net/ethernet/intel/fm10k/fm10k_pf.h index 8e814df709d2..ad3696893cb1 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k_pf.h +++ b/drivers/net/ethernet/intel/fm10k/fm10k_pf.h @@ -99,8 +99,6 @@ extern const struct fm10k_tlv_attr fm10k_err_msg_attr[]; s32 fm10k_iov_select_vid(struct fm10k_vf_info *vf_info, u16 vid); s32 fm10k_iov_msg_msix_pf(struct fm10k_hw *, u32 **, struct fm10k_mbx_info *); -s32 fm10k_iov_msg_mac_vlan_pf(struct fm10k_hw *, u32 **, - struct fm10k_mbx_info *); s32 fm10k_iov_msg_lport_state_pf(struct fm10k_hw *, u32 **, struct fm10k_mbx_info *); diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h index d4255c2706fa..c67963bfe14e 100644 --- a/drivers/net/ethernet/intel/i40e/i40e.h +++ b/drivers/net/ethernet/intel/i40e/i40e.h @@ -88,6 +88,7 @@ enum i40e_state { __I40E_SERVICE_SCHED, __I40E_ADMINQ_EVENT_PENDING, __I40E_MDD_EVENT_PENDING, + __I40E_MDD_VF_PRINT_PENDING, __I40E_VFLR_EVENT_PENDING, __I40E_RESET_RECOVERY_PENDING, __I40E_TIMEOUT_RECOVERY_PENDING, @@ -191,6 +192,7 @@ enum i40e_pf_flags { */ I40E_FLAG_TOTAL_PORT_SHUTDOWN_ENA, I40E_FLAG_VF_VLAN_PRUNING_ENA, + I40E_FLAG_MDD_AUTO_RESET_VF, I40E_PF_FLAGS_NBITS, /* must be last */ }; @@ -572,7 +574,7 @@ struct i40e_pf { int num_alloc_vfs; /* actual number of VFs allocated */ u32 vf_aq_requests; u32 arq_overflows; /* Not fatal, possibly indicative of problems */ - + struct ratelimit_state mdd_message_rate_limit; /* DCBx/DCBNL capability for PF that indicates * whether DCBx is managed by firmware or host * based agent (LLDPAD). 
Also, indicates what @@ -1189,7 +1191,6 @@ int i40e_add_del_fdir(struct i40e_vsi *vsi, struct i40e_fdir_filter *input, bool add); void i40e_fdir_check_and_reenable(struct i40e_pf *pf); u32 i40e_get_current_fd_count(struct i40e_pf *pf); -u32 i40e_get_cur_guaranteed_fd_count(struct i40e_pf *pf); u32 i40e_get_current_atr_cnt(struct i40e_pf *pf); u32 i40e_get_global_fd_count(struct i40e_pf *pf); bool i40e_set_ntuple(struct i40e_pf *pf, netdev_features_t features); @@ -1197,7 +1198,6 @@ void i40e_set_ethtool_ops(struct net_device *netdev); struct i40e_mac_filter *i40e_add_filter(struct i40e_vsi *vsi, const u8 *macaddr, s16 vlan); void __i40e_del_filter(struct i40e_vsi *vsi, struct i40e_mac_filter *f); -void i40e_del_filter(struct i40e_vsi *vsi, const u8 *macaddr, s16 vlan); int i40e_sync_vsi_filters(struct i40e_vsi *vsi); struct i40e_vsi *i40e_vsi_setup(struct i40e_pf *pf, u8 type, u16 uplink, u32 param1); @@ -1313,7 +1313,6 @@ int i40e_update_adq_vsi_queues(struct i40e_vsi *vsi, int vsi_offset); int i40e_is_vsi_uplink_mode_veb(struct i40e_vsi *vsi); int i40e_get_partition_bw_setting(struct i40e_pf *pf); int i40e_set_partition_bw_setting(struct i40e_pf *pf); -int i40e_commit_partition_bw_setting(struct i40e_pf *pf); void i40e_print_link_message(struct i40e_vsi *vsi, bool isup); void i40e_set_fec_in_flags(u8 fec_cfg, unsigned long *flags); diff --git a/drivers/net/ethernet/intel/i40e/i40e_adminq.c b/drivers/net/ethernet/intel/i40e/i40e_adminq.c index f73f5930fc58..175c1320c143 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_adminq.c +++ b/drivers/net/ethernet/intel/i40e/i40e_adminq.c @@ -1016,16 +1016,6 @@ i40e_asq_send_command_atomic_v2(struct i40e_hw *hw, return status; } -int -i40e_asq_send_command_v2(struct i40e_hw *hw, struct i40e_aq_desc *desc, - void *buff, /* can be NULL */ u16 buff_size, - struct i40e_asq_cmd_details *cmd_details, - enum i40e_admin_queue_err *aq_status) -{ - return i40e_asq_send_command_atomic_v2(hw, desc, buff, buff_size, - cmd_details, true, 
aq_status); -} - /** * i40e_fill_default_direct_cmd_desc - AQ descriptor helper function * @desc: pointer to the temp descriptor (non DMA mem) diff --git a/drivers/net/ethernet/intel/i40e/i40e_common.c b/drivers/net/ethernet/intel/i40e/i40e_common.c index e8031f1a9b4f..370b4bddee44 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_common.c +++ b/drivers/net/ethernet/intel/i40e/i40e_common.c @@ -1805,37 +1805,6 @@ int i40e_aq_set_vsi_broadcast(struct i40e_hw *hw, } /** - * i40e_aq_set_vsi_vlan_promisc - control the VLAN promiscuous setting - * @hw: pointer to the hw struct - * @seid: vsi number - * @enable: set MAC L2 layer unicast promiscuous enable/disable for a given VLAN - * @cmd_details: pointer to command details structure or NULL - **/ -int i40e_aq_set_vsi_vlan_promisc(struct i40e_hw *hw, - u16 seid, bool enable, - struct i40e_asq_cmd_details *cmd_details) -{ - struct i40e_aq_desc desc; - struct i40e_aqc_set_vsi_promiscuous_modes *cmd = - (struct i40e_aqc_set_vsi_promiscuous_modes *)&desc.params.raw; - u16 flags = 0; - int status; - - i40e_fill_default_direct_cmd_desc(&desc, - i40e_aqc_opc_set_vsi_promiscuous_modes); - if (enable) - flags |= I40E_AQC_SET_VSI_PROMISC_VLAN; - - cmd->promiscuous_flags = cpu_to_le16(flags); - cmd->valid_flags = cpu_to_le16(I40E_AQC_SET_VSI_PROMISC_VLAN); - cmd->seid = cpu_to_le16(seid); - - status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); - - return status; -} - -/** * i40e_aq_get_vsi_params - get VSI configuration info * @hw: pointer to the hw struct * @vsi_ctx: pointer to a vsi context struct @@ -2436,136 +2405,6 @@ i40e_aq_remove_macvlan_v2(struct i40e_hw *hw, u16 seid, } /** - * i40e_mirrorrule_op - Internal helper function to add/delete mirror rule - * @hw: pointer to the hw struct - * @opcode: AQ opcode for add or delete mirror rule - * @sw_seid: Switch SEID (to which rule refers) - * @rule_type: Rule Type (ingress/egress/VLAN) - * @id: Destination VSI SEID or Rule ID - * @count: length of the list - * 
@mr_list: list of mirrored VSI SEIDs or VLAN IDs - * @cmd_details: pointer to command details structure or NULL - * @rule_id: Rule ID returned from FW - * @rules_used: Number of rules used in internal switch - * @rules_free: Number of rules free in internal switch - * - * Add/Delete a mirror rule to a specific switch. Mirror rules are supported for - * VEBs/VEPA elements only - **/ -static int i40e_mirrorrule_op(struct i40e_hw *hw, - u16 opcode, u16 sw_seid, u16 rule_type, u16 id, - u16 count, __le16 *mr_list, - struct i40e_asq_cmd_details *cmd_details, - u16 *rule_id, u16 *rules_used, u16 *rules_free) -{ - struct i40e_aq_desc desc; - struct i40e_aqc_add_delete_mirror_rule *cmd = - (struct i40e_aqc_add_delete_mirror_rule *)&desc.params.raw; - struct i40e_aqc_add_delete_mirror_rule_completion *resp = - (struct i40e_aqc_add_delete_mirror_rule_completion *)&desc.params.raw; - u16 buf_size; - int status; - - buf_size = count * sizeof(*mr_list); - - /* prep the rest of the request */ - i40e_fill_default_direct_cmd_desc(&desc, opcode); - cmd->seid = cpu_to_le16(sw_seid); - cmd->rule_type = cpu_to_le16(rule_type & - I40E_AQC_MIRROR_RULE_TYPE_MASK); - cmd->num_entries = cpu_to_le16(count); - /* Dest VSI for add, rule_id for delete */ - cmd->destination = cpu_to_le16(id); - if (mr_list) { - desc.flags |= cpu_to_le16((u16)(I40E_AQ_FLAG_BUF | - I40E_AQ_FLAG_RD)); - if (buf_size > I40E_AQ_LARGE_BUF) - desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_LB); - } - - status = i40e_asq_send_command(hw, &desc, mr_list, buf_size, - cmd_details); - if (!status || - hw->aq.asq_last_status == I40E_AQ_RC_ENOSPC) { - if (rule_id) - *rule_id = le16_to_cpu(resp->rule_id); - if (rules_used) - *rules_used = le16_to_cpu(resp->mirror_rules_used); - if (rules_free) - *rules_free = le16_to_cpu(resp->mirror_rules_free); - } - return status; -} - -/** - * i40e_aq_add_mirrorrule - add a mirror rule - * @hw: pointer to the hw struct - * @sw_seid: Switch SEID (to which rule refers) - * @rule_type: Rule Type 
(ingress/egress/VLAN) - * @dest_vsi: SEID of VSI to which packets will be mirrored - * @count: length of the list - * @mr_list: list of mirrored VSI SEIDs or VLAN IDs - * @cmd_details: pointer to command details structure or NULL - * @rule_id: Rule ID returned from FW - * @rules_used: Number of rules used in internal switch - * @rules_free: Number of rules free in internal switch - * - * Add mirror rule. Mirror rules are supported for VEBs or VEPA elements only - **/ -int i40e_aq_add_mirrorrule(struct i40e_hw *hw, u16 sw_seid, - u16 rule_type, u16 dest_vsi, u16 count, - __le16 *mr_list, - struct i40e_asq_cmd_details *cmd_details, - u16 *rule_id, u16 *rules_used, u16 *rules_free) -{ - if (!(rule_type == I40E_AQC_MIRROR_RULE_TYPE_ALL_INGRESS || - rule_type == I40E_AQC_MIRROR_RULE_TYPE_ALL_EGRESS)) { - if (count == 0 || !mr_list) - return -EINVAL; - } - - return i40e_mirrorrule_op(hw, i40e_aqc_opc_add_mirror_rule, sw_seid, - rule_type, dest_vsi, count, mr_list, - cmd_details, rule_id, rules_used, rules_free); -} - -/** - * i40e_aq_delete_mirrorrule - delete a mirror rule - * @hw: pointer to the hw struct - * @sw_seid: Switch SEID (to which rule refers) - * @rule_type: Rule Type (ingress/egress/VLAN) - * @count: length of the list - * @rule_id: Rule ID that is returned in the receive desc as part of - * add_mirrorrule. - * @mr_list: list of mirrored VLAN IDs to be removed - * @cmd_details: pointer to command details structure or NULL - * @rules_used: Number of rules used in internal switch - * @rules_free: Number of rules free in internal switch - * - * Delete a mirror rule. 
Mirror rules are supported for VEBs/VEPA elements only - **/ -int i40e_aq_delete_mirrorrule(struct i40e_hw *hw, u16 sw_seid, - u16 rule_type, u16 rule_id, u16 count, - __le16 *mr_list, - struct i40e_asq_cmd_details *cmd_details, - u16 *rules_used, u16 *rules_free) -{ - /* Rule ID has to be valid except rule_type: INGRESS VLAN mirroring */ - if (rule_type == I40E_AQC_MIRROR_RULE_TYPE_VLAN) { - /* count and mr_list shall be valid for rule_type INGRESS VLAN - * mirroring. For other rule_type, count and rule_type should - * not matter. - */ - if (count == 0 || !mr_list) - return -EINVAL; - } - - return i40e_mirrorrule_op(hw, i40e_aqc_opc_delete_mirror_rule, sw_seid, - rule_type, rule_id, count, mr_list, - cmd_details, NULL, rules_used, rules_free); -} - -/** * i40e_aq_send_msg_to_vf * @hw: pointer to the hardware structure * @vfid: VF id to send msg @@ -3180,41 +3019,6 @@ i40e_aq_update_nvm_exit: } /** - * i40e_aq_rearrange_nvm - * @hw: pointer to the hw struct - * @rearrange_nvm: defines direction of rearrangement - * @cmd_details: pointer to command details structure or NULL - * - * Rearrange NVM structure, available only for transition FW - **/ -int i40e_aq_rearrange_nvm(struct i40e_hw *hw, - u8 rearrange_nvm, - struct i40e_asq_cmd_details *cmd_details) -{ - struct i40e_aqc_nvm_update *cmd; - struct i40e_aq_desc desc; - int status; - - cmd = (struct i40e_aqc_nvm_update *)&desc.params.raw; - - i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_nvm_update); - - rearrange_nvm &= (I40E_AQ_NVM_REARRANGE_TO_FLAT | - I40E_AQ_NVM_REARRANGE_TO_STRUCT); - - if (!rearrange_nvm) { - status = -EINVAL; - goto i40e_aq_rearrange_nvm_exit; - } - - cmd->command_flags |= rearrange_nvm; - status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); - -i40e_aq_rearrange_nvm_exit: - return status; -} - -/** * i40e_aq_get_lldp_mib * @hw: pointer to the hw struct * @bridge_type: type of bridge requested @@ -3335,44 +3139,6 @@ int i40e_aq_cfg_lldp_mib_change_event(struct i40e_hw *hw, 
} /** - * i40e_aq_restore_lldp - * @hw: pointer to the hw struct - * @setting: pointer to factory setting variable or NULL - * @restore: True if factory settings should be restored - * @cmd_details: pointer to command details structure or NULL - * - * Restore LLDP Agent factory settings if @restore set to True. In other case - * only returns factory setting in AQ response. - **/ -int -i40e_aq_restore_lldp(struct i40e_hw *hw, u8 *setting, bool restore, - struct i40e_asq_cmd_details *cmd_details) -{ - struct i40e_aq_desc desc; - struct i40e_aqc_lldp_restore *cmd = - (struct i40e_aqc_lldp_restore *)&desc.params.raw; - int status; - - if (!test_bit(I40E_HW_CAP_FW_LLDP_PERSISTENT, hw->caps)) { - i40e_debug(hw, I40E_DEBUG_ALL, - "Restore LLDP not supported by current FW version.\n"); - return -ENODEV; - } - - i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_lldp_restore); - - if (restore) - cmd->command |= I40E_AQ_LLDP_AGENT_RESTORE; - - status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); - - if (setting) - *setting = cmd->command & 1; - - return status; -} - -/** * i40e_aq_stop_lldp * @hw: pointer to the hw struct * @shutdown_agent: True if LLDP Agent needs to be Shutdown @@ -4570,84 +4336,6 @@ phy_write_end: } /** - * i40e_write_phy_register - * @hw: pointer to the HW structure - * @page: registers page number - * @reg: register address in the page - * @phy_addr: PHY address on MDIO interface - * @value: PHY register value - * - * Writes value to specified PHY register - **/ -int i40e_write_phy_register(struct i40e_hw *hw, - u8 page, u16 reg, u8 phy_addr, u16 value) -{ - int status; - - switch (hw->device_id) { - case I40E_DEV_ID_1G_BASE_T_X722: - status = i40e_write_phy_register_clause22(hw, reg, phy_addr, - value); - break; - case I40E_DEV_ID_1G_BASE_T_BC: - case I40E_DEV_ID_5G_BASE_T_BC: - case I40E_DEV_ID_10G_BASE_T: - case I40E_DEV_ID_10G_BASE_T4: - case I40E_DEV_ID_10G_BASE_T_BC: - case I40E_DEV_ID_10G_BASE_T_X722: - case I40E_DEV_ID_25G_B: - 
case I40E_DEV_ID_25G_SFP28: - status = i40e_write_phy_register_clause45(hw, page, reg, - phy_addr, value); - break; - default: - status = -EIO; - break; - } - - return status; -} - -/** - * i40e_read_phy_register - * @hw: pointer to the HW structure - * @page: registers page number - * @reg: register address in the page - * @phy_addr: PHY address on MDIO interface - * @value: PHY register value - * - * Reads specified PHY register value - **/ -int i40e_read_phy_register(struct i40e_hw *hw, - u8 page, u16 reg, u8 phy_addr, u16 *value) -{ - int status; - - switch (hw->device_id) { - case I40E_DEV_ID_1G_BASE_T_X722: - status = i40e_read_phy_register_clause22(hw, reg, phy_addr, - value); - break; - case I40E_DEV_ID_1G_BASE_T_BC: - case I40E_DEV_ID_5G_BASE_T_BC: - case I40E_DEV_ID_10G_BASE_T: - case I40E_DEV_ID_10G_BASE_T4: - case I40E_DEV_ID_10G_BASE_T_BC: - case I40E_DEV_ID_10G_BASE_T_X722: - case I40E_DEV_ID_25G_B: - case I40E_DEV_ID_25G_SFP28: - status = i40e_read_phy_register_clause45(hw, page, reg, - phy_addr, value); - break; - default: - status = -EIO; - break; - } - - return status; -} - -/** * i40e_get_phy_address * @hw: pointer to the HW structure * @dev_num: PHY port num that address we want @@ -4663,80 +4351,6 @@ u8 i40e_get_phy_address(struct i40e_hw *hw, u8 dev_num) } /** - * i40e_blink_phy_link_led - * @hw: pointer to the HW structure - * @time: time how long led will blinks in secs - * @interval: gap between LED on and off in msecs - * - * Blinks PHY link LED - **/ -int i40e_blink_phy_link_led(struct i40e_hw *hw, - u32 time, u32 interval) -{ - u16 led_addr = I40E_PHY_LED_PROV_REG_1; - u16 gpio_led_port; - u8 phy_addr = 0; - int status = 0; - u16 led_ctl; - u8 port_num; - u16 led_reg; - u32 i; - - i = rd32(hw, I40E_PFGEN_PORTNUM); - port_num = (u8)(i & I40E_PFGEN_PORTNUM_PORT_NUM_MASK); - phy_addr = i40e_get_phy_address(hw, port_num); - - for (gpio_led_port = 0; gpio_led_port < 3; gpio_led_port++, - led_addr++) { - status = 
i40e_read_phy_register_clause45(hw, - I40E_PHY_COM_REG_PAGE, - led_addr, phy_addr, - &led_reg); - if (status) - goto phy_blinking_end; - led_ctl = led_reg; - if (led_reg & I40E_PHY_LED_LINK_MODE_MASK) { - led_reg = 0; - status = i40e_write_phy_register_clause45(hw, - I40E_PHY_COM_REG_PAGE, - led_addr, phy_addr, - led_reg); - if (status) - goto phy_blinking_end; - break; - } - } - - if (time > 0 && interval > 0) { - for (i = 0; i < time * 1000; i += interval) { - status = i40e_read_phy_register_clause45(hw, - I40E_PHY_COM_REG_PAGE, - led_addr, phy_addr, &led_reg); - if (status) - goto restore_config; - if (led_reg & I40E_PHY_LED_MANUAL_ON) - led_reg = 0; - else - led_reg = I40E_PHY_LED_MANUAL_ON; - status = i40e_write_phy_register_clause45(hw, - I40E_PHY_COM_REG_PAGE, - led_addr, phy_addr, led_reg); - if (status) - goto restore_config; - msleep(interval); - } - } - -restore_config: - status = i40e_write_phy_register_clause45(hw, - I40E_PHY_COM_REG_PAGE, - led_addr, phy_addr, led_ctl); - -phy_blinking_end: - return status; -} - -/** * i40e_led_get_reg - read LED register * @hw: pointer to the HW structure * @led_addr: LED register address @@ -5269,39 +4883,6 @@ i40e_find_segment_in_package(u32 segment_type, (struct i40e_profile_section_header *)((u8 *)(profile) + (offset)) /** - * i40e_find_section_in_profile - * @section_type: the section type to search for (i.e., SECTION_TYPE_NOTE) - * @profile: pointer to the i40e segment header to be searched - * - * This function searches i40e segment for a particular section type. On - * success it returns a pointer to the section header, otherwise it will - * return NULL. 
- **/ -struct i40e_profile_section_header * -i40e_find_section_in_profile(u32 section_type, - struct i40e_profile_segment *profile) -{ - struct i40e_profile_section_header *sec; - struct i40e_section_table *sec_tbl; - u32 sec_off; - u32 i; - - if (profile->header.type != SEGMENT_TYPE_I40E) - return NULL; - - I40E_SECTION_TABLE(profile, sec_tbl); - - for (i = 0; i < sec_tbl->section_count; i++) { - sec_off = sec_tbl->section_offset[i]; - sec = I40E_SECTION_HEADER(profile, sec_off); - if (sec->section.type == section_type) - return sec; - } - - return NULL; -} - -/** * i40e_ddp_exec_aq_section - Execute generic AQ for DDP * @hw: pointer to the hw struct * @aq: command buffer containing all data to execute AQ @@ -5524,45 +5105,6 @@ i40e_rollback_profile(struct i40e_hw *hw, struct i40e_profile_segment *profile, } /** - * i40e_add_pinfo_to_list - * @hw: pointer to the hardware structure - * @profile: pointer to the profile segment of the package - * @profile_info_sec: buffer for information section - * @track_id: package tracking id - * - * Register a profile to the list of loaded profiles. 
- */ -int -i40e_add_pinfo_to_list(struct i40e_hw *hw, - struct i40e_profile_segment *profile, - u8 *profile_info_sec, u32 track_id) -{ - struct i40e_profile_section_header *sec = NULL; - struct i40e_profile_info *pinfo; - u32 offset = 0, info = 0; - int status = 0; - - sec = (struct i40e_profile_section_header *)profile_info_sec; - sec->tbl_size = 1; - sec->data_end = sizeof(struct i40e_profile_section_header) + - sizeof(struct i40e_profile_info); - sec->section.type = SECTION_TYPE_INFO; - sec->section.offset = sizeof(struct i40e_profile_section_header); - sec->section.size = sizeof(struct i40e_profile_info); - pinfo = (struct i40e_profile_info *)(profile_info_sec + - sec->section.offset); - pinfo->track_id = track_id; - pinfo->version = profile->version; - pinfo->op = I40E_DDP_ADD_TRACKID; - memcpy(pinfo->name, profile->name, I40E_DDP_NAME_SIZE); - - status = i40e_aq_write_ddp(hw, (void *)sec, sec->data_end, - track_id, &offset, &info, NULL); - - return status; -} - -/** * i40e_aq_add_cloud_filters * @hw: pointer to the hardware structure * @seid: VSI seid to add cloud filters from diff --git a/drivers/net/ethernet/intel/i40e/i40e_dcb.c b/drivers/net/ethernet/intel/i40e/i40e_dcb.c index 8db1eb0c1768..352e957443fd 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_dcb.c +++ b/drivers/net/ethernet/intel/i40e/i40e_dcb.c @@ -1491,19 +1491,6 @@ void i40e_dcb_hw_set_num_tc(struct i40e_hw *hw, u8 num_tc) } /** - * i40e_dcb_hw_get_num_tc - * @hw: pointer to the hw struct - * - * Returns number of traffic classes configured in HW - **/ -u8 i40e_dcb_hw_get_num_tc(struct i40e_hw *hw) -{ - u32 reg = rd32(hw, I40E_PRTDCB_GENC); - - return FIELD_GET(I40E_PRTDCB_GENC_NUMTC_MASK, reg); -} - -/** * i40e_dcb_hw_rx_ets_bw_config * @hw: pointer to the hw struct * @bw_share: Bandwidth share indexed per traffic class diff --git a/drivers/net/ethernet/intel/i40e/i40e_dcb.h b/drivers/net/ethernet/intel/i40e/i40e_dcb.h index d76497566e40..d5662c639c41 100644 --- 
a/drivers/net/ethernet/intel/i40e/i40e_dcb.h +++ b/drivers/net/ethernet/intel/i40e/i40e_dcb.h @@ -253,7 +253,6 @@ void i40e_dcb_hw_rx_cmd_monitor_config(struct i40e_hw *hw, void i40e_dcb_hw_pfc_config(struct i40e_hw *hw, u8 pfc_en, u8 *prio_tc); void i40e_dcb_hw_set_num_tc(struct i40e_hw *hw, u8 num_tc); -u8 i40e_dcb_hw_get_num_tc(struct i40e_hw *hw); void i40e_dcb_hw_rx_ets_bw_config(struct i40e_hw *hw, u8 *bw_share, u8 *mode, u8 *prio_type); void i40e_dcb_hw_rx_up2tc_config(struct i40e_hw *hw, u8 *prio_tc); diff --git a/drivers/net/ethernet/intel/i40e/i40e_debugfs.c b/drivers/net/ethernet/intel/i40e/i40e_debugfs.c index 208c2f0857b6..6cd9da662ae1 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_debugfs.c +++ b/drivers/net/ethernet/intel/i40e/i40e_debugfs.c @@ -722,7 +722,7 @@ static void i40e_dbg_dump_vf(struct i40e_pf *pf, int vf_id) dev_info(&pf->pdev->dev, "vf %2d: VSI id=%d, seid=%d, qps=%d\n", vf_id, vf->lan_vsi_id, vsi->seid, vf->num_queue_pairs); dev_info(&pf->pdev->dev, " num MDD=%lld\n", - vf->num_mdd_events); + vf->mdd_tx_events.count + vf->mdd_rx_events.count); } else { dev_info(&pf->pdev->dev, "invalid VF id %d\n", vf_id); } diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c index bce5b76f1e7a..8a7a83f83ee5 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c +++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c @@ -459,6 +459,8 @@ static const struct i40e_priv_flags i40e_gstrings_priv_flags[] = { I40E_PRIV_FLAG("base-r-fec", I40E_FLAG_BASE_R_FEC, 0), I40E_PRIV_FLAG("vf-vlan-pruning", I40E_FLAG_VF_VLAN_PRUNING_ENA, 0), + I40E_PRIV_FLAG("mdd-auto-reset-vf", + I40E_FLAG_MDD_AUTO_RESET_VF, 0), }; #define I40E_PRIV_FLAGS_STR_LEN ARRAY_SIZE(i40e_gstrings_priv_flags) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 0e1d9e2fbf38..65a702668e21 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ 
b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -1666,9 +1666,8 @@ struct i40e_mac_filter *i40e_add_filter(struct i40e_vsi *vsi, * @vsi: VSI to remove from * @f: the filter to remove from the list * - * This function should be called instead of i40e_del_filter only if you know - * the exact filter you will remove already, such as via i40e_find_filter or - * i40e_find_mac. + * This function requires you've found the exact filter you will remove + * already, such as via i40e_find_filter or i40e_find_mac. * * NOTE: This function is expected to be called with mac_filter_hash_lock * being held. @@ -1698,29 +1697,6 @@ void __i40e_del_filter(struct i40e_vsi *vsi, struct i40e_mac_filter *f) } /** - * i40e_del_filter - Remove a MAC/VLAN filter from the VSI - * @vsi: the VSI to be searched - * @macaddr: the MAC address - * @vlan: the VLAN - * - * NOTE: This function is expected to be called with mac_filter_hash_lock - * being held. - * ANOTHER NOTE: This function MUST be called from within the context of - * the "safe" variants of any list iterators, e.g. list_for_each_entry_safe() - * instead of list_for_each_entry(). 
- **/ -void i40e_del_filter(struct i40e_vsi *vsi, const u8 *macaddr, s16 vlan) -{ - struct i40e_mac_filter *f; - - if (!vsi || !macaddr) - return; - - f = i40e_find_filter(vsi, macaddr, vlan); - __i40e_del_filter(vsi, f); -} - -/** * i40e_add_mac_filter - Add a MAC filter for all active VLANs * @vsi: the VSI to be searched * @macaddr: the mac address to be filtered @@ -9629,19 +9605,6 @@ static void i40e_handle_lan_overflow_event(struct i40e_pf *pf, } /** - * i40e_get_cur_guaranteed_fd_count - Get the consumed guaranteed FD filters - * @pf: board private structure - **/ -u32 i40e_get_cur_guaranteed_fd_count(struct i40e_pf *pf) -{ - u32 val, fcnt_prog; - - val = rd32(&pf->hw, I40E_PFQF_FDSTAT); - fcnt_prog = (val & I40E_PFQF_FDSTAT_GUARANT_CNT_MASK); - return fcnt_prog; -} - -/** * i40e_get_current_fd_count - Get total FD filters programmed for this PF * @pf: board private structure **/ @@ -11217,6 +11180,67 @@ static void i40e_handle_reset_warning(struct i40e_pf *pf, bool lock_acquired) } /** + * i40e_print_vf_mdd_event - print VF Tx/Rx malicious driver detect event + * @pf: board private structure + * @vf: pointer to the VF structure + * @is_tx: true - for Tx event, false - for Rx + */ +static void i40e_print_vf_mdd_event(struct i40e_pf *pf, struct i40e_vf *vf, + bool is_tx) +{ + dev_err(&pf->pdev->dev, is_tx ? + "%lld Tx Malicious Driver Detection events detected on PF %d VF %d MAC %pm. mdd-auto-reset-vfs=%s\n" : + "%lld Rx Malicious Driver Detection events detected on PF %d VF %d MAC %pm. mdd-auto-reset-vfs=%s\n", + is_tx ? vf->mdd_tx_events.count : vf->mdd_rx_events.count, + pf->hw.pf_id, + vf->vf_id, + vf->default_lan_addr.addr, + str_on_off(test_bit(I40E_FLAG_MDD_AUTO_RESET_VF, pf->flags))); +} + +/** + * i40e_print_vfs_mdd_events - print VFs malicious driver detect event + * @pf: pointer to the PF structure + * + * Called from i40e_handle_mdd_event to rate limit and print VFs MDD events. 
+ */ +static void i40e_print_vfs_mdd_events(struct i40e_pf *pf) +{ + unsigned int i; + + /* check that there are pending MDD events to print */ + if (!test_and_clear_bit(__I40E_MDD_VF_PRINT_PENDING, pf->state)) + return; + + if (!__ratelimit(&pf->mdd_message_rate_limit)) + return; + + for (i = 0; i < pf->num_alloc_vfs; i++) { + struct i40e_vf *vf = &pf->vf[i]; + bool is_printed = false; + + /* only print Rx MDD event message if there are new events */ + if (vf->mdd_rx_events.count != vf->mdd_rx_events.last_printed) { + vf->mdd_rx_events.last_printed = vf->mdd_rx_events.count; + i40e_print_vf_mdd_event(pf, vf, false); + is_printed = true; + } + + /* only print Tx MDD event message if there are new events */ + if (vf->mdd_tx_events.count != vf->mdd_tx_events.last_printed) { + vf->mdd_tx_events.last_printed = vf->mdd_tx_events.count; + i40e_print_vf_mdd_event(pf, vf, true); + is_printed = true; + } + + if (is_printed && !test_bit(I40E_FLAG_MDD_AUTO_RESET_VF, pf->flags)) + dev_info(&pf->pdev->dev, + "Use PF Control I/F to re-enable the VF #%d\n", + i); + } +} + +/** * i40e_handle_mdd_event * @pf: pointer to the PF structure * @@ -11230,8 +11254,13 @@ static void i40e_handle_mdd_event(struct i40e_pf *pf) u32 reg; int i; - if (!test_bit(__I40E_MDD_EVENT_PENDING, pf->state)) + if (!test_and_clear_bit(__I40E_MDD_EVENT_PENDING, pf->state)) { + /* Since the VF MDD event logging is rate limited, check if + * there are pending MDD events. 
+ */ + i40e_print_vfs_mdd_events(pf); return; + } /* find what triggered the MDD event */ reg = rd32(hw, I40E_GL_MDET_TX); @@ -11275,36 +11304,48 @@ static void i40e_handle_mdd_event(struct i40e_pf *pf) /* see if one of the VFs needs its hand slapped */ for (i = 0; i < pf->num_alloc_vfs && mdd_detected; i++) { + bool is_mdd_on_tx = false; + bool is_mdd_on_rx = false; + vf = &(pf->vf[i]); reg = rd32(hw, I40E_VP_MDET_TX(i)); if (reg & I40E_VP_MDET_TX_VALID_MASK) { + set_bit(__I40E_MDD_VF_PRINT_PENDING, pf->state); wr32(hw, I40E_VP_MDET_TX(i), 0xFFFF); - vf->num_mdd_events++; - dev_info(&pf->pdev->dev, "TX driver issue detected on VF %d\n", - i); - dev_info(&pf->pdev->dev, - "Use PF Control I/F to re-enable the VF\n"); + vf->mdd_tx_events.count++; set_bit(I40E_VF_STATE_DISABLED, &vf->vf_states); + is_mdd_on_tx = true; } reg = rd32(hw, I40E_VP_MDET_RX(i)); if (reg & I40E_VP_MDET_RX_VALID_MASK) { + set_bit(__I40E_MDD_VF_PRINT_PENDING, pf->state); wr32(hw, I40E_VP_MDET_RX(i), 0xFFFF); - vf->num_mdd_events++; - dev_info(&pf->pdev->dev, "RX driver issue detected on VF %d\n", - i); - dev_info(&pf->pdev->dev, - "Use PF Control I/F to re-enable the VF\n"); + vf->mdd_rx_events.count++; set_bit(I40E_VF_STATE_DISABLED, &vf->vf_states); + is_mdd_on_rx = true; + } + + if ((is_mdd_on_tx || is_mdd_on_rx) && + test_bit(I40E_FLAG_MDD_AUTO_RESET_VF, pf->flags)) { + /* VF MDD event counters will be cleared by + * reset, so print the event prior to reset. 
+ */ + if (is_mdd_on_rx) + i40e_print_vf_mdd_event(pf, vf, false); + if (is_mdd_on_tx) + i40e_print_vf_mdd_event(pf, vf, true); + + i40e_vc_reset_vf(vf, true); } } - /* re-enable mdd interrupt cause */ - clear_bit(__I40E_MDD_EVENT_PENDING, pf->state); reg = rd32(hw, I40E_PFINT_ICR0_ENA); reg |= I40E_PFINT_ICR0_ENA_MAL_DETECT_MASK; wr32(hw, I40E_PFINT_ICR0_ENA, reg); i40e_flush(hw); + + i40e_print_vfs_mdd_events(pf); } /** @@ -12614,89 +12655,6 @@ int i40e_set_partition_bw_setting(struct i40e_pf *pf) } /** - * i40e_commit_partition_bw_setting - Commit BW settings for this PF partition - * @pf: board private structure - **/ -int i40e_commit_partition_bw_setting(struct i40e_pf *pf) -{ - /* Commit temporary BW setting to permanent NVM image */ - enum i40e_admin_queue_err last_aq_status; - u16 nvm_word; - int ret; - - if (pf->hw.partition_id != 1) { - dev_info(&pf->pdev->dev, - "Commit BW only works on partition 1! This is partition %d", - pf->hw.partition_id); - ret = -EOPNOTSUPP; - goto bw_commit_out; - } - - /* Acquire NVM for read access */ - ret = i40e_acquire_nvm(&pf->hw, I40E_RESOURCE_READ); - last_aq_status = pf->hw.aq.asq_last_status; - if (ret) { - dev_info(&pf->pdev->dev, - "Cannot acquire NVM for read access, err %pe aq_err %s\n", - ERR_PTR(ret), - i40e_aq_str(&pf->hw, last_aq_status)); - goto bw_commit_out; - } - - /* Read word 0x10 of NVM - SW compatibility word 1 */ - ret = i40e_aq_read_nvm(&pf->hw, - I40E_SR_NVM_CONTROL_WORD, - 0x10, sizeof(nvm_word), &nvm_word, - false, NULL); - /* Save off last admin queue command status before releasing - * the NVM - */ - last_aq_status = pf->hw.aq.asq_last_status; - i40e_release_nvm(&pf->hw); - if (ret) { - dev_info(&pf->pdev->dev, "NVM read error, err %pe aq_err %s\n", - ERR_PTR(ret), - i40e_aq_str(&pf->hw, last_aq_status)); - goto bw_commit_out; - } - - /* Wait a bit for NVM release to complete */ - msleep(50); - - /* Acquire NVM for write access */ - ret = i40e_acquire_nvm(&pf->hw, I40E_RESOURCE_WRITE); - 
last_aq_status = pf->hw.aq.asq_last_status; - if (ret) { - dev_info(&pf->pdev->dev, - "Cannot acquire NVM for write access, err %pe aq_err %s\n", - ERR_PTR(ret), - i40e_aq_str(&pf->hw, last_aq_status)); - goto bw_commit_out; - } - /* Write it back out unchanged to initiate update NVM, - * which will force a write of the shadow (alt) RAM to - * the NVM - thus storing the bandwidth values permanently. - */ - ret = i40e_aq_update_nvm(&pf->hw, - I40E_SR_NVM_CONTROL_WORD, - 0x10, sizeof(nvm_word), - &nvm_word, true, 0, NULL); - /* Save off last admin queue command status before releasing - * the NVM - */ - last_aq_status = pf->hw.aq.asq_last_status; - i40e_release_nvm(&pf->hw); - if (ret) - dev_info(&pf->pdev->dev, - "BW settings NOT SAVED, err %pe aq_err %s\n", - ERR_PTR(ret), - i40e_aq_str(&pf->hw, last_aq_status)); -bw_commit_out: - - return ret; -} - -/** * i40e_is_total_port_shutdown_enabled - read NVM and return value * if total port shutdown feature is enabled for this PF * @pf: board private structure @@ -15998,6 +15956,9 @@ static int i40e_probe(struct pci_dev *pdev, const struct pci_device_id *ent) ERR_PTR(err), i40e_aq_str(&pf->hw, pf->hw.aq.asq_last_status)); + /* VF MDD event logs are rate limited to one second intervals */ + ratelimit_state_init(&pf->mdd_message_rate_limit, 1 * HZ, 1); + /* Reconfigure hardware for allowing smaller MSS in the case * of TSO, so that we avoid the MDD being fired and causing * a reset in the case of small MSS+TSO. 
diff --git a/drivers/net/ethernet/intel/i40e/i40e_prototype.h b/drivers/net/ethernet/intel/i40e/i40e_prototype.h index 5a0699ca7ce5..099bb8ab7d70 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_prototype.h +++ b/drivers/net/ethernet/intel/i40e/i40e_prototype.h @@ -27,13 +27,6 @@ i40e_asq_send_command(struct i40e_hw *hw, struct i40e_aq_desc *desc, void *buff, /* can be NULL */ u16 buff_size, struct i40e_asq_cmd_details *cmd_details); int -i40e_asq_send_command_v2(struct i40e_hw *hw, - struct i40e_aq_desc *desc, - void *buff, /* can be NULL */ - u16 buff_size, - struct i40e_asq_cmd_details *cmd_details, - enum i40e_admin_queue_err *aq_status); -int i40e_asq_send_command_atomic(struct i40e_hw *hw, struct i40e_aq_desc *desc, void *buff, /* can be NULL */ u16 buff_size, struct i40e_asq_cmd_details *cmd_details, @@ -72,8 +65,6 @@ int i40e_led_set_phy(struct i40e_hw *hw, bool on, u16 led_addr, u32 mode); int i40e_led_get_phy(struct i40e_hw *hw, u16 *led_addr, u16 *val); -int i40e_blink_phy_link_led(struct i40e_hw *hw, - u32 time, u32 interval); /* admin send queue commands */ @@ -141,9 +132,6 @@ int i40e_aq_set_vsi_uc_promisc_on_vlan(struct i40e_hw *hw, int i40e_aq_set_vsi_bc_promisc_on_vlan(struct i40e_hw *hw, u16 seid, bool enable, u16 vid, struct i40e_asq_cmd_details *cmd_details); -int i40e_aq_set_vsi_vlan_promisc(struct i40e_hw *hw, - u16 seid, bool enable, - struct i40e_asq_cmd_details *cmd_details); int i40e_aq_get_vsi_params(struct i40e_hw *hw, struct i40e_vsi_context *vsi_ctx, struct i40e_asq_cmd_details *cmd_details); @@ -176,14 +164,6 @@ i40e_aq_remove_macvlan_v2(struct i40e_hw *hw, u16 seid, struct i40e_aqc_remove_macvlan_element_data *mv_list, u16 count, struct i40e_asq_cmd_details *cmd_details, enum i40e_admin_queue_err *aq_status); -int i40e_aq_add_mirrorrule(struct i40e_hw *hw, u16 sw_seid, - u16 rule_type, u16 dest_vsi, u16 count, __le16 *mr_list, - struct i40e_asq_cmd_details *cmd_details, - u16 *rule_id, u16 *rules_used, u16 *rules_free); -int 
i40e_aq_delete_mirrorrule(struct i40e_hw *hw, u16 sw_seid, - u16 rule_type, u16 rule_id, u16 count, __le16 *mr_list, - struct i40e_asq_cmd_details *cmd_details, - u16 *rules_used, u16 *rules_free); int i40e_aq_send_msg_to_vf(struct i40e_hw *hw, u16 vfid, u32 v_opcode, u32 v_retval, u8 *msg, u16 msglen, @@ -220,9 +200,6 @@ int i40e_aq_update_nvm(struct i40e_hw *hw, u8 module_pointer, u32 offset, u16 length, void *data, bool last_command, u8 preservation_flags, struct i40e_asq_cmd_details *cmd_details); -int i40e_aq_rearrange_nvm(struct i40e_hw *hw, - u8 rearrange_nvm, - struct i40e_asq_cmd_details *cmd_details); int i40e_aq_get_lldp_mib(struct i40e_hw *hw, u8 bridge_type, u8 mib_type, void *buff, u16 buff_size, u16 *local_len, u16 *remote_len, @@ -234,9 +211,6 @@ i40e_aq_set_lldp_mib(struct i40e_hw *hw, int i40e_aq_cfg_lldp_mib_change_event(struct i40e_hw *hw, bool enable_update, struct i40e_asq_cmd_details *cmd_details); -int -i40e_aq_restore_lldp(struct i40e_hw *hw, u8 *setting, bool restore, - struct i40e_asq_cmd_details *cmd_details); int i40e_aq_stop_lldp(struct i40e_hw *hw, bool shutdown_agent, bool persist, struct i40e_asq_cmd_details *cmd_details); @@ -458,13 +432,7 @@ int i40e_read_phy_register_clause45(struct i40e_hw *hw, u8 page, u16 reg, u8 phy_addr, u16 *value); int i40e_write_phy_register_clause45(struct i40e_hw *hw, u8 page, u16 reg, u8 phy_addr, u16 value); -int i40e_read_phy_register(struct i40e_hw *hw, u8 page, u16 reg, - u8 phy_addr, u16 *value); -int i40e_write_phy_register(struct i40e_hw *hw, u8 page, u16 reg, - u8 phy_addr, u16 value); u8 i40e_get_phy_address(struct i40e_hw *hw, u8 dev_num); -int i40e_blink_phy_link_led(struct i40e_hw *hw, - u32 time, u32 interval); int i40e_aq_write_ddp(struct i40e_hw *hw, void *buff, u16 buff_size, u32 track_id, u32 *error_offset, u32 *error_info, @@ -477,20 +445,12 @@ int i40e_aq_get_ddp_list(struct i40e_hw *hw, void *buff, struct i40e_generic_seg_header * i40e_find_segment_in_package(u32 segment_type, 
struct i40e_package_header *pkg_header); -struct i40e_profile_section_header * -i40e_find_section_in_profile(u32 section_type, - struct i40e_profile_segment *profile); int i40e_write_profile(struct i40e_hw *hw, struct i40e_profile_segment *i40e_seg, u32 track_id); int i40e_rollback_profile(struct i40e_hw *hw, struct i40e_profile_segment *i40e_seg, u32 track_id); -int -i40e_add_pinfo_to_list(struct i40e_hw *hw, - struct i40e_profile_segment *profile, - u8 *profile_info_sec, u32 track_id); - /* i40e_ddp */ int i40e_ddp_flash(struct net_device *netdev, struct ethtool_flash *flash); diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index dfa785e39458..1120f8e4bb67 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -216,7 +216,7 @@ void i40e_vc_notify_vf_reset(struct i40e_vf *vf) * @notify_vf: notify vf about reset or not * Reset VF handler. **/ -static void i40e_vc_reset_vf(struct i40e_vf *vf, bool notify_vf) +void i40e_vc_reset_vf(struct i40e_vf *vf, bool notify_vf) { struct i40e_pf *pf = vf->pf; int i; diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h index 66f95e2f3146..5cf74f16f433 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h @@ -64,6 +64,12 @@ struct i40evf_channel { u64 max_tx_rate; /* bandwidth rate allocation for VSIs */ }; +struct i40e_mdd_vf_events { + u64 count; /* total count of Rx|Tx events */ + /* count number of the last printed event */ + u64 last_printed; +}; + /* VF information structure */ struct i40e_vf { struct i40e_pf *pf; @@ -92,7 +98,9 @@ struct i40e_vf { u8 num_queue_pairs; /* num of qps assigned to VF vsis */ u8 num_req_queues; /* num of requested qps */ - u64 num_mdd_events; /* num of mdd events detected */ + /* num of mdd tx and rx events detected */ + struct 
i40e_mdd_vf_events mdd_rx_events; + struct i40e_mdd_vf_events mdd_tx_events; unsigned long vf_caps; /* vf's adv. capabilities */ unsigned long vf_states; /* vf's runtime states */ @@ -120,6 +128,7 @@ int i40e_alloc_vfs(struct i40e_pf *pf, u16 num_alloc_vfs); int i40e_vc_process_vf_msg(struct i40e_pf *pf, s16 vf_id, u32 v_opcode, u32 v_retval, u8 *msg, u16 msglen); int i40e_vc_process_vflr_event(struct i40e_pf *pf); +void i40e_vc_reset_vf(struct i40e_vf *vf, bool notify_vf); bool i40e_reset_vf(struct i40e_vf *vf, bool flr); bool i40e_reset_all_vfs(struct i40e_pf *pf, bool flr); void i40e_vc_notify_vf_reset(struct i40e_vf *vf); diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index a9e54866ae6b..7740f446c73f 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -1968,6 +1968,7 @@ err: static void iavf_finish_config(struct work_struct *work) { struct iavf_adapter *adapter; + bool netdev_released = false; int pairs, err; adapter = container_of(work, struct iavf_adapter, finish_config); @@ -1988,7 +1989,16 @@ static void iavf_finish_config(struct work_struct *work) switch (adapter->state) { case __IAVF_DOWN: + /* Set the real number of queues when reset occurs while + * state == __IAVF_DOWN + */ + pairs = adapter->num_active_queues; + netif_set_real_num_rx_queues(adapter->netdev, pairs); + netif_set_real_num_tx_queues(adapter->netdev, pairs); + if (adapter->netdev->reg_state != NETREG_REGISTERED) { + mutex_unlock(&adapter->netdev->lock); + netdev_released = true; err = register_netdevice(adapter->netdev); if (err) { dev_err(&adapter->pdev->dev, "Unable to register netdev (%d)\n", @@ -2003,11 +2013,7 @@ static void iavf_finish_config(struct work_struct *work) goto out; } } - - /* Set the real number of queues when reset occurs while - * state == __IAVF_DOWN - */ - fallthrough; + break; case __IAVF_RUNNING: pairs = adapter->num_active_queues; 
netif_set_real_num_rx_queues(adapter->netdev, pairs); @@ -2020,7 +2026,8 @@ static void iavf_finish_config(struct work_struct *work) out: mutex_unlock(&adapter->crit_lock); - mutex_unlock(&adapter->netdev->lock); + if (!netdev_released) + mutex_unlock(&adapter->netdev->lock); rtnl_unlock(); } @@ -2713,12 +2720,16 @@ static void iavf_watchdog_task(struct work_struct *work) struct iavf_adapter *adapter = container_of(work, struct iavf_adapter, watchdog_task.work); + struct net_device *netdev = adapter->netdev; struct iavf_hw *hw = &adapter->hw; u32 reg_val; + mutex_lock(&netdev->lock); if (!mutex_trylock(&adapter->crit_lock)) { - if (adapter->state == __IAVF_REMOVE) + if (adapter->state == __IAVF_REMOVE) { + mutex_unlock(&netdev->lock); return; + } goto restart_watchdog; } @@ -2730,30 +2741,35 @@ static void iavf_watchdog_task(struct work_struct *work) case __IAVF_STARTUP: iavf_startup(adapter); mutex_unlock(&adapter->crit_lock); + mutex_unlock(&netdev->lock); queue_delayed_work(adapter->wq, &adapter->watchdog_task, msecs_to_jiffies(30)); return; case __IAVF_INIT_VERSION_CHECK: iavf_init_version_check(adapter); mutex_unlock(&adapter->crit_lock); + mutex_unlock(&netdev->lock); queue_delayed_work(adapter->wq, &adapter->watchdog_task, msecs_to_jiffies(30)); return; case __IAVF_INIT_GET_RESOURCES: iavf_init_get_resources(adapter); mutex_unlock(&adapter->crit_lock); + mutex_unlock(&netdev->lock); queue_delayed_work(adapter->wq, &adapter->watchdog_task, msecs_to_jiffies(1)); return; case __IAVF_INIT_EXTENDED_CAPS: iavf_init_process_extended_caps(adapter); mutex_unlock(&adapter->crit_lock); + mutex_unlock(&netdev->lock); queue_delayed_work(adapter->wq, &adapter->watchdog_task, msecs_to_jiffies(1)); return; case __IAVF_INIT_CONFIG_ADAPTER: iavf_init_config_adapter(adapter); mutex_unlock(&adapter->crit_lock); + mutex_unlock(&netdev->lock); queue_delayed_work(adapter->wq, &adapter->watchdog_task, msecs_to_jiffies(1)); return; @@ -2765,6 +2781,7 @@ static void 
iavf_watchdog_task(struct work_struct *work) * as it can loop forever */ mutex_unlock(&adapter->crit_lock); + mutex_unlock(&netdev->lock); return; } if (++adapter->aq_wait_count > IAVF_AQ_MAX_ERR) { @@ -2773,6 +2790,7 @@ static void iavf_watchdog_task(struct work_struct *work) adapter->flags |= IAVF_FLAG_PF_COMMS_FAILED; iavf_shutdown_adminq(hw); mutex_unlock(&adapter->crit_lock); + mutex_unlock(&netdev->lock); queue_delayed_work(adapter->wq, &adapter->watchdog_task, (5 * HZ)); return; @@ -2780,6 +2798,7 @@ static void iavf_watchdog_task(struct work_struct *work) /* Try again from failed step*/ iavf_change_state(adapter, adapter->last_state); mutex_unlock(&adapter->crit_lock); + mutex_unlock(&netdev->lock); queue_delayed_work(adapter->wq, &adapter->watchdog_task, HZ); return; case __IAVF_COMM_FAILED: @@ -2792,6 +2811,7 @@ static void iavf_watchdog_task(struct work_struct *work) iavf_change_state(adapter, __IAVF_INIT_FAILED); adapter->flags &= ~IAVF_FLAG_PF_COMMS_FAILED; mutex_unlock(&adapter->crit_lock); + mutex_unlock(&netdev->lock); return; } reg_val = rd32(hw, IAVF_VFGEN_RSTAT) & @@ -2811,12 +2831,14 @@ static void iavf_watchdog_task(struct work_struct *work) adapter->aq_required = 0; adapter->current_op = VIRTCHNL_OP_UNKNOWN; mutex_unlock(&adapter->crit_lock); + mutex_unlock(&netdev->lock); queue_delayed_work(adapter->wq, &adapter->watchdog_task, msecs_to_jiffies(10)); return; case __IAVF_RESETTING: mutex_unlock(&adapter->crit_lock); + mutex_unlock(&netdev->lock); queue_delayed_work(adapter->wq, &adapter->watchdog_task, HZ * 2); return; @@ -2847,6 +2869,7 @@ static void iavf_watchdog_task(struct work_struct *work) case __IAVF_REMOVE: default: mutex_unlock(&adapter->crit_lock); + mutex_unlock(&netdev->lock); return; } @@ -2858,12 +2881,14 @@ static void iavf_watchdog_task(struct work_struct *work) dev_err(&adapter->pdev->dev, "Hardware reset detected\n"); iavf_schedule_reset(adapter, IAVF_FLAG_RESET_PENDING); mutex_unlock(&adapter->crit_lock); + 
mutex_unlock(&netdev->lock); queue_delayed_work(adapter->wq, &adapter->watchdog_task, HZ * 2); return; } mutex_unlock(&adapter->crit_lock); + mutex_unlock(&netdev->lock); restart_watchdog: if (adapter->state >= __IAVF_DOWN) queue_work(adapter->wq, &adapter->adminq_task); @@ -4340,14 +4365,17 @@ static int iavf_open(struct net_device *netdev) return -EIO; } + mutex_lock(&netdev->lock); while (!mutex_trylock(&adapter->crit_lock)) { /* If we are in __IAVF_INIT_CONFIG_ADAPTER state the crit_lock * is already taken and iavf_open is called from an upper * device's notifier reacting on NETDEV_REGISTER event. * We have to leave here to avoid dead lock. */ - if (adapter->state == __IAVF_INIT_CONFIG_ADAPTER) + if (adapter->state == __IAVF_INIT_CONFIG_ADAPTER) { + mutex_unlock(&netdev->lock); return -EBUSY; + } usleep_range(500, 1000); } @@ -4396,6 +4424,7 @@ static int iavf_open(struct net_device *netdev) iavf_irq_enable(adapter, true); mutex_unlock(&adapter->crit_lock); + mutex_unlock(&netdev->lock); return 0; @@ -4408,6 +4437,7 @@ err_setup_tx: iavf_free_all_tx_resources(adapter); err_unlock: mutex_unlock(&adapter->crit_lock); + mutex_unlock(&netdev->lock); return err; } @@ -4429,10 +4459,12 @@ static int iavf_close(struct net_device *netdev) u64 aq_to_restore; int status; + mutex_lock(&netdev->lock); mutex_lock(&adapter->crit_lock); if (adapter->state <= __IAVF_DOWN_PENDING) { mutex_unlock(&adapter->crit_lock); + mutex_unlock(&netdev->lock); return 0; } @@ -4466,6 +4498,7 @@ static int iavf_close(struct net_device *netdev) iavf_free_traffic_irqs(adapter); mutex_unlock(&adapter->crit_lock); + mutex_unlock(&netdev->lock); /* We explicitly don't free resources here because the hardware is * still active and can DMA into memory. 
Resources are cleared in @@ -5342,6 +5375,7 @@ static int iavf_suspend(struct device *dev_d) netif_device_detach(netdev); + mutex_lock(&netdev->lock); mutex_lock(&adapter->crit_lock); if (netif_running(netdev)) { @@ -5353,6 +5387,7 @@ static int iavf_suspend(struct device *dev_d) iavf_reset_interrupt_capability(adapter); mutex_unlock(&adapter->crit_lock); + mutex_unlock(&netdev->lock); return 0; } @@ -5451,6 +5486,7 @@ static void iavf_remove(struct pci_dev *pdev) if (netdev->reg_state == NETREG_REGISTERED) unregister_netdev(netdev); + mutex_lock(&netdev->lock); mutex_lock(&adapter->crit_lock); dev_info(&adapter->pdev->dev, "Removing device\n"); iavf_change_state(adapter, __IAVF_REMOVE); @@ -5487,6 +5523,7 @@ static void iavf_remove(struct pci_dev *pdev) mutex_destroy(&hw->aq.asq_mutex); mutex_unlock(&adapter->crit_lock); mutex_destroy(&adapter->crit_lock); + mutex_unlock(&netdev->lock); iounmap(hw->hw_addr); pci_release_regions(pdev); diff --git a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h index 3bf05b135b35..73f5fddf3ee9 100644 --- a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h +++ b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h @@ -2271,6 +2271,8 @@ struct ice_aqc_get_pkg_info_resp { struct ice_aqc_get_pkg_info pkg_info[]; }; +#define ICE_AQC_GET_CGU_MAX_PHASE_ADJ GENMASK(30, 0) + /* Get CGU abilities command response data structure (indirect 0x0C61) */ struct ice_aqc_get_cgu_abilities { u8 num_inputs; diff --git a/drivers/net/ethernet/intel/ice/ice_dpll.c b/drivers/net/ethernet/intel/ice/ice_dpll.c index d5ad6d84007c..38e151c7ea23 100644 --- a/drivers/net/ethernet/intel/ice/ice_dpll.c +++ b/drivers/net/ethernet/intel/ice/ice_dpll.c @@ -2065,6 +2065,18 @@ static int ice_dpll_init_worker(struct ice_pf *pf) } /** + * ice_dpll_phase_range_set - initialize phase adjust range helper + * @range: pointer to phase adjust range struct to be initialized + * @phase_adj: a value to be used as min(-)/max(+) boundary 
+ */ +static void ice_dpll_phase_range_set(struct dpll_pin_phase_adjust_range *range, + u32 phase_adj) +{ + range->min = -phase_adj; + range->max = phase_adj; +} + +/** * ice_dpll_init_info_pins_generic - initializes generic pins info * @pf: board private structure * @input: if input pins initialized @@ -2105,8 +2117,8 @@ static int ice_dpll_init_info_pins_generic(struct ice_pf *pf, bool input) for (i = 0; i < pin_num; i++) { pins[i].idx = i; pins[i].prop.board_label = labels[i]; - pins[i].prop.phase_range.min = phase_adj_max; - pins[i].prop.phase_range.max = -phase_adj_max; + ice_dpll_phase_range_set(&pins[i].prop.phase_range, + phase_adj_max); pins[i].prop.capabilities = cap; pins[i].pf = pf; ret = ice_dpll_pin_state_update(pf, &pins[i], pin_type, NULL); @@ -2152,6 +2164,7 @@ ice_dpll_init_info_direct_pins(struct ice_pf *pf, struct ice_hw *hw = &pf->hw; struct ice_dpll_pin *pins; unsigned long caps; + u32 phase_adj_max; u8 freq_supp_num; bool input; @@ -2159,11 +2172,13 @@ ice_dpll_init_info_direct_pins(struct ice_pf *pf, case ICE_DPLL_PIN_TYPE_INPUT: pins = pf->dplls.inputs; num_pins = pf->dplls.num_inputs; + phase_adj_max = pf->dplls.input_phase_adj_max; input = true; break; case ICE_DPLL_PIN_TYPE_OUTPUT: pins = pf->dplls.outputs; num_pins = pf->dplls.num_outputs; + phase_adj_max = pf->dplls.output_phase_adj_max; input = false; break; default: @@ -2188,19 +2203,13 @@ ice_dpll_init_info_direct_pins(struct ice_pf *pf, return ret; caps |= (DPLL_PIN_CAPABILITIES_PRIORITY_CAN_CHANGE | DPLL_PIN_CAPABILITIES_STATE_CAN_CHANGE); - pins[i].prop.phase_range.min = - pf->dplls.input_phase_adj_max; - pins[i].prop.phase_range.max = - -pf->dplls.input_phase_adj_max; } else { - pins[i].prop.phase_range.min = - pf->dplls.output_phase_adj_max; - pins[i].prop.phase_range.max = - -pf->dplls.output_phase_adj_max; ret = ice_cgu_get_output_pin_state_caps(hw, i, &caps); if (ret) return ret; } + ice_dpll_phase_range_set(&pins[i].prop.phase_range, + phase_adj_max); 
pins[i].prop.capabilities = caps; ret = ice_dpll_pin_state_update(pf, &pins[i], pin_type, NULL); if (ret) @@ -2308,8 +2317,10 @@ static int ice_dpll_init_info(struct ice_pf *pf, bool cgu) dp->dpll_idx = abilities.pps_dpll_idx; d->num_inputs = abilities.num_inputs; d->num_outputs = abilities.num_outputs; - d->input_phase_adj_max = le32_to_cpu(abilities.max_in_phase_adj); - d->output_phase_adj_max = le32_to_cpu(abilities.max_out_phase_adj); + d->input_phase_adj_max = le32_to_cpu(abilities.max_in_phase_adj) & + ICE_AQC_GET_CGU_MAX_PHASE_ADJ; + d->output_phase_adj_max = le32_to_cpu(abilities.max_out_phase_adj) & + ICE_AQC_GET_CGU_MAX_PHASE_ADJ; alloc_size = sizeof(*d->inputs) * d->num_inputs; d->inputs = kzalloc(alloc_size, GFP_KERNEL); diff --git a/drivers/net/ethernet/intel/ice/ice_ptp_consts.h b/drivers/net/ethernet/intel/ice/ice_ptp_consts.h index 585ce200c60f..d75f0eddd631 100644 --- a/drivers/net/ethernet/intel/ice/ice_ptp_consts.h +++ b/drivers/net/ethernet/intel/ice/ice_ptp_consts.h @@ -761,9 +761,9 @@ const struct ice_vernier_info_e82x e822_vernier[NUM_ICE_PTP_LNK_SPD] = { /* rx_desk_rsgb_par */ 644531250, /* 644.53125 MHz Reed Solomon gearbox */ /* tx_desk_rsgb_pcs */ - 644531250, /* 644.53125 MHz Reed Solomon gearbox */ + 390625000, /* 390.625 MHz Reed Solomon gearbox */ /* rx_desk_rsgb_pcs */ - 644531250, /* 644.53125 MHz Reed Solomon gearbox */ + 390625000, /* 390.625 MHz Reed Solomon gearbox */ /* tx_fixed_delay */ 1620, /* pmd_adj_divisor */ diff --git a/drivers/net/ethernet/intel/igb/Makefile b/drivers/net/ethernet/intel/igb/Makefile index 463c0d26b9d4..6c1b702fd992 100644 --- a/drivers/net/ethernet/intel/igb/Makefile +++ b/drivers/net/ethernet/intel/igb/Makefile @@ -8,4 +8,4 @@ obj-$(CONFIG_IGB) += igb.o igb-y := igb_main.o igb_ethtool.o e1000_82575.o \ e1000_mac.o e1000_nvm.o e1000_phy.o e1000_mbx.o \ - e1000_i210.o igb_ptp.o igb_hwmon.o + e1000_i210.o igb_ptp.o igb_hwmon.o igb_xsk.o diff --git a/drivers/net/ethernet/intel/igb/igb.h 
b/drivers/net/ethernet/intel/igb/igb.h index 3c2dc7bdebb5..02f340280d20 100644 --- a/drivers/net/ethernet/intel/igb/igb.h +++ b/drivers/net/ethernet/intel/igb/igb.h @@ -18,8 +18,10 @@ #include <linux/i2c-algo-bit.h> #include <linux/pci.h> #include <linux/mdio.h> +#include <linux/lockdep.h> #include <net/xdp.h> +#include <net/xdp_sock_drv.h> struct igb_adapter; @@ -86,6 +88,7 @@ struct igb_adapter; #define IGB_XDP_CONSUMED BIT(0) #define IGB_XDP_TX BIT(1) #define IGB_XDP_REDIR BIT(2) +#define IGB_XDP_EXIT BIT(3) struct vf_data_storage { unsigned char vf_mac_addresses[ETH_ALEN]; @@ -255,6 +258,7 @@ enum igb_tx_flags { enum igb_tx_buf_type { IGB_TYPE_SKB = 0, IGB_TYPE_XDP, + IGB_TYPE_XSK }; /* wrapper around a pointer to a socket buffer, @@ -320,6 +324,7 @@ struct igb_ring { union { /* array of buffer info structs */ struct igb_tx_buffer *tx_buffer_info; struct igb_rx_buffer *rx_buffer_info; + struct xdp_buff **rx_buffer_info_zc; }; void *desc; /* descriptor ring memory */ unsigned long flags; /* ring specific flags */ @@ -357,6 +362,7 @@ struct igb_ring { }; }; struct xdp_rxq_info xdp_rxq; + struct xsk_buff_pool *xsk_pool; } ____cacheline_internodealigned_in_smp; struct igb_q_vector { @@ -384,7 +390,8 @@ enum e1000_ring_flags_t { IGB_RING_FLAG_RX_SCTP_CSUM, IGB_RING_FLAG_RX_LB_VLAN_BSWAP, IGB_RING_FLAG_TX_CTX_IDX, - IGB_RING_FLAG_TX_DETECT_HANG + IGB_RING_FLAG_TX_DETECT_HANG, + IGB_RING_FLAG_TX_DISABLED }; #define ring_uses_large_buffer(ring) \ @@ -731,12 +738,21 @@ int igb_setup_tx_resources(struct igb_ring *); int igb_setup_rx_resources(struct igb_ring *); void igb_free_tx_resources(struct igb_ring *); void igb_free_rx_resources(struct igb_ring *); +void igb_clean_tx_ring(struct igb_ring *tx_ring); +void igb_clean_rx_ring(struct igb_ring *rx_ring); void igb_configure_tx_ring(struct igb_adapter *, struct igb_ring *); void igb_configure_rx_ring(struct igb_adapter *, struct igb_ring *); +void igb_finalize_xdp(struct igb_adapter *adapter, unsigned int status); +void 
igb_update_rx_stats(struct igb_q_vector *q_vector, unsigned int packets, + unsigned int bytes); void igb_setup_tctl(struct igb_adapter *); void igb_setup_rctl(struct igb_adapter *); void igb_setup_srrctl(struct igb_adapter *, struct igb_ring *); netdev_tx_t igb_xmit_frame_ring(struct sk_buff *, struct igb_ring *); +int igb_xdp_xmit_back(struct igb_adapter *adapter, struct xdp_buff *xdp); +void igb_process_skb_fields(struct igb_ring *rx_ring, + union e1000_adv_rx_desc *rx_desc, + struct sk_buff *skb); void igb_alloc_rx_buffers(struct igb_ring *, u16); void igb_update_stats(struct igb_adapter *); bool igb_has_link(struct igb_adapter *adapter); @@ -797,6 +813,33 @@ static inline struct netdev_queue *txring_txq(const struct igb_ring *tx_ring) return netdev_get_tx_queue(tx_ring->netdev, tx_ring->queue_index); } +/* This function assumes __netif_tx_lock is held by the caller. */ +static inline void igb_xdp_ring_update_tail(struct igb_ring *ring) +{ + lockdep_assert_held(&txring_txq(ring)->_xmit_lock); + + /* Force memory writes to complete before letting h/w know there + * are new descriptors to fetch. 
+ */ + wmb(); + writel(ring->next_to_use, ring->tail); +} + +static inline struct igb_ring *igb_xdp_tx_queue_mapping(struct igb_adapter *adapter) +{ + unsigned int r_idx = smp_processor_id(); + + if (r_idx >= adapter->num_tx_queues) + r_idx = r_idx % adapter->num_tx_queues; + + return adapter->tx_ring[r_idx]; +} + +static inline bool igb_xdp_is_enabled(struct igb_adapter *adapter) +{ + return !!READ_ONCE(adapter->xdp_prog); +} + int igb_add_filter(struct igb_adapter *adapter, struct igb_nfc_filter *input); int igb_erase_filter(struct igb_adapter *adapter, @@ -807,4 +850,17 @@ int igb_add_mac_steering_filter(struct igb_adapter *adapter, int igb_del_mac_steering_filter(struct igb_adapter *adapter, const u8 *addr, u8 queue, u8 flags); +struct xsk_buff_pool *igb_xsk_pool(struct igb_adapter *adapter, + struct igb_ring *ring); +int igb_xsk_pool_setup(struct igb_adapter *adapter, + struct xsk_buff_pool *pool, + u16 qid); +bool igb_alloc_rx_buffers_zc(struct igb_ring *rx_ring, + struct xsk_buff_pool *xsk_pool, u16 count); +void igb_clean_rx_ring_zc(struct igb_ring *rx_ring); +int igb_clean_rx_irq_zc(struct igb_q_vector *q_vector, + struct xsk_buff_pool *xsk_pool, const int budget); +bool igb_xmit_zc(struct igb_ring *tx_ring, struct xsk_buff_pool *xsk_pool); +int igb_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags); + #endif /* _IGB_H_ */ diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index 288a4bb2683a..d368b753a467 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -33,7 +33,6 @@ #include <linux/bpf_trace.h> #include <linux/pm_runtime.h> #include <linux/etherdevice.h> -#include <linux/lockdep.h> #ifdef CONFIG_IGB_DCA #include <linux/dca.h> #endif @@ -116,8 +115,6 @@ static void igb_configure_tx(struct igb_adapter *); static void igb_configure_rx(struct igb_adapter *); static void igb_clean_all_tx_rings(struct igb_adapter *); static void 
igb_clean_all_rx_rings(struct igb_adapter *); -static void igb_clean_tx_ring(struct igb_ring *); -static void igb_clean_rx_ring(struct igb_ring *); static void igb_set_rx_mode(struct net_device *); static void igb_update_phy_info(struct timer_list *); static void igb_watchdog(struct timer_list *); @@ -475,12 +472,17 @@ rx_ring_summary: for (i = 0; i < rx_ring->count; i++) { const char *next_desc; - struct igb_rx_buffer *buffer_info; - buffer_info = &rx_ring->rx_buffer_info[i]; + dma_addr_t dma = (dma_addr_t)0; + struct igb_rx_buffer *buffer_info = NULL; rx_desc = IGB_RX_DESC(rx_ring, i); u0 = (struct my_u0 *)rx_desc; staterr = le32_to_cpu(rx_desc->wb.upper.status_error); + if (!rx_ring->xsk_pool) { + buffer_info = &rx_ring->rx_buffer_info[i]; + dma = buffer_info->dma; + } + if (i == rx_ring->next_to_use) next_desc = " NTU"; else if (i == rx_ring->next_to_clean) @@ -500,11 +502,11 @@ rx_ring_summary: "R ", i, le64_to_cpu(u0->a), le64_to_cpu(u0->b), - (u64)buffer_info->dma, + (u64)dma, next_desc); if (netif_msg_pktdata(adapter) && - buffer_info->dma && buffer_info->page) { + buffer_info && dma && buffer_info->page) { print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 1, @@ -1990,7 +1992,11 @@ static void igb_configure(struct igb_adapter *adapter) */ for (i = 0; i < adapter->num_rx_queues; i++) { struct igb_ring *ring = adapter->rx_ring[i]; - igb_alloc_rx_buffers(ring, igb_desc_unused(ring)); + if (ring->xsk_pool) + igb_alloc_rx_buffers_zc(ring, ring->xsk_pool, + igb_desc_unused(ring)); + else + igb_alloc_rx_buffers(ring, igb_desc_unused(ring)); } } @@ -2911,37 +2917,20 @@ static int igb_xdp_setup(struct net_device *dev, struct netdev_bpf *bpf) static int igb_xdp(struct net_device *dev, struct netdev_bpf *xdp) { + struct igb_adapter *adapter = netdev_priv(dev); + switch (xdp->command) { case XDP_SETUP_PROG: return igb_xdp_setup(dev, xdp); + case XDP_SETUP_XSK_POOL: + return igb_xsk_pool_setup(adapter, xdp->xsk.pool, + xdp->xsk.queue_id); default: return -EINVAL; } 
} -/* This function assumes __netif_tx_lock is held by the caller. */ -static void igb_xdp_ring_update_tail(struct igb_ring *ring) -{ - lockdep_assert_held(&txring_txq(ring)->_xmit_lock); - - /* Force memory writes to complete before letting h/w know there - * are new descriptors to fetch. - */ - wmb(); - writel(ring->next_to_use, ring->tail); -} - -static struct igb_ring *igb_xdp_tx_queue_mapping(struct igb_adapter *adapter) -{ - unsigned int r_idx = smp_processor_id(); - - if (r_idx >= adapter->num_tx_queues) - r_idx = r_idx % adapter->num_tx_queues; - - return adapter->tx_ring[r_idx]; -} - -static int igb_xdp_xmit_back(struct igb_adapter *adapter, struct xdp_buff *xdp) +int igb_xdp_xmit_back(struct igb_adapter *adapter, struct xdp_buff *xdp) { struct xdp_frame *xdpf = xdp_convert_buff_to_frame(xdp); int cpu = smp_processor_id(); @@ -2955,7 +2944,8 @@ static int igb_xdp_xmit_back(struct igb_adapter *adapter, struct xdp_buff *xdp) /* During program transitions its possible adapter->xdp_prog is assigned * but ring has not been configured yet. In this case simply abort xmit. */ - tx_ring = adapter->xdp_prog ? igb_xdp_tx_queue_mapping(adapter) : NULL; + tx_ring = igb_xdp_is_enabled(adapter) ? + igb_xdp_tx_queue_mapping(adapter) : NULL; if (unlikely(!tx_ring)) return IGB_XDP_CONSUMED; @@ -2988,10 +2978,14 @@ static int igb_xdp_xmit(struct net_device *dev, int n, /* During program transitions its possible adapter->xdp_prog is assigned * but ring has not been configured yet. In this case simply abort xmit. */ - tx_ring = adapter->xdp_prog ? igb_xdp_tx_queue_mapping(adapter) : NULL; + tx_ring = igb_xdp_is_enabled(adapter) ? 
+ igb_xdp_tx_queue_mapping(adapter) : NULL; if (unlikely(!tx_ring)) return -ENXIO; + if (unlikely(test_bit(IGB_RING_FLAG_TX_DISABLED, &tx_ring->flags))) + return -ENXIO; + nq = txring_txq(tx_ring); __netif_tx_lock(nq, cpu); @@ -3042,6 +3036,7 @@ static const struct net_device_ops igb_netdev_ops = { .ndo_setup_tc = igb_setup_tc, .ndo_bpf = igb_xdp, .ndo_xdp_xmit = igb_xdp_xmit, + .ndo_xsk_wakeup = igb_xsk_wakeup, }; /** @@ -3338,7 +3333,8 @@ static int igb_probe(struct pci_dev *pdev, const struct pci_device_id *ent) netdev->priv_flags |= IFF_SUPP_NOFCS; netdev->priv_flags |= IFF_UNICAST_FLT; - netdev->xdp_features = NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_REDIRECT; + netdev->xdp_features = NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_REDIRECT | + NETDEV_XDP_ACT_XSK_ZEROCOPY; /* MTU range: 68 - 9216 */ netdev->min_mtu = ETH_MIN_MTU; @@ -4364,6 +4360,8 @@ void igb_configure_tx_ring(struct igb_adapter *adapter, u64 tdba = ring->dma; int reg_idx = ring->reg_idx; + WRITE_ONCE(ring->xsk_pool, igb_xsk_pool(adapter, ring)); + wr32(E1000_TDLEN(reg_idx), ring->count * sizeof(union e1000_adv_tx_desc)); wr32(E1000_TDBAL(reg_idx), @@ -4424,7 +4422,8 @@ int igb_setup_rx_resources(struct igb_ring *rx_ring) if (xdp_rxq_info_is_reg(&rx_ring->xdp_rxq)) xdp_rxq_info_unreg(&rx_ring->xdp_rxq); res = xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev, - rx_ring->queue_index, 0); + rx_ring->queue_index, + rx_ring->q_vector->napi.napi_id); if (res < 0) { dev_err(dev, "Failed to register xdp_rxq index %u\n", rx_ring->queue_index); @@ -4720,12 +4719,17 @@ void igb_setup_srrctl(struct igb_adapter *adapter, struct igb_ring *ring) struct e1000_hw *hw = &adapter->hw; int reg_idx = ring->reg_idx; u32 srrctl = 0; + u32 buf_size; - srrctl = IGB_RX_HDR_LEN << E1000_SRRCTL_BSIZEHDRSIZE_SHIFT; - if (ring_uses_large_buffer(ring)) - srrctl |= IGB_RXBUFFER_3072 >> E1000_SRRCTL_BSIZEPKT_SHIFT; + if (ring->xsk_pool) + buf_size = xsk_pool_get_rx_frame_size(ring->xsk_pool); + else if (ring_uses_large_buffer(ring)) + 
buf_size = IGB_RXBUFFER_3072; else - srrctl |= IGB_RXBUFFER_2048 >> E1000_SRRCTL_BSIZEPKT_SHIFT; + buf_size = IGB_RXBUFFER_2048; + + srrctl = IGB_RX_HDR_LEN << E1000_SRRCTL_BSIZEHDRSIZE_SHIFT; + srrctl |= buf_size >> E1000_SRRCTL_BSIZEPKT_SHIFT; srrctl |= E1000_SRRCTL_DESCTYPE_ADV_ONEBUF; if (hw->mac.type >= e1000_82580) srrctl |= E1000_SRRCTL_TIMESTAMP; @@ -4757,8 +4761,17 @@ void igb_configure_rx_ring(struct igb_adapter *adapter, u32 rxdctl = 0; xdp_rxq_info_unreg_mem_model(&ring->xdp_rxq); - WARN_ON(xdp_rxq_info_reg_mem_model(&ring->xdp_rxq, - MEM_TYPE_PAGE_SHARED, NULL)); + WRITE_ONCE(ring->xsk_pool, igb_xsk_pool(adapter, ring)); + if (ring->xsk_pool) { + WARN_ON(xdp_rxq_info_reg_mem_model(&ring->xdp_rxq, + MEM_TYPE_XSK_BUFF_POOL, + NULL)); + xsk_pool_set_rxq_info(ring->xsk_pool, &ring->xdp_rxq); + } else { + WARN_ON(xdp_rxq_info_reg_mem_model(&ring->xdp_rxq, + MEM_TYPE_PAGE_SHARED, + NULL)); + } /* disable the queue */ wr32(E1000_RXDCTL(reg_idx), 0); @@ -4785,9 +4798,12 @@ void igb_configure_rx_ring(struct igb_adapter *adapter, rxdctl |= IGB_RX_HTHRESH << 8; rxdctl |= IGB_RX_WTHRESH << 16; - /* initialize rx_buffer_info */ - memset(ring->rx_buffer_info, 0, - sizeof(struct igb_rx_buffer) * ring->count); + if (ring->xsk_pool) + memset(ring->rx_buffer_info_zc, 0, + sizeof(*ring->rx_buffer_info_zc) * ring->count); + else + memset(ring->rx_buffer_info, 0, + sizeof(*ring->rx_buffer_info) * ring->count); /* initialize Rx descriptor 0 */ rx_desc = IGB_RX_DESC(ring, 0); @@ -4888,19 +4904,24 @@ static void igb_free_all_tx_resources(struct igb_adapter *adapter) * igb_clean_tx_ring - Free Tx Buffers * @tx_ring: ring to be cleaned **/ -static void igb_clean_tx_ring(struct igb_ring *tx_ring) +void igb_clean_tx_ring(struct igb_ring *tx_ring) { u16 i = tx_ring->next_to_clean; struct igb_tx_buffer *tx_buffer = &tx_ring->tx_buffer_info[i]; + u32 xsk_frames = 0; while (i != tx_ring->next_to_use) { union e1000_adv_tx_desc *eop_desc, *tx_desc; /* Free all the Tx ring sk_buffs or 
xdp frames */ - if (tx_buffer->type == IGB_TYPE_SKB) + if (tx_buffer->type == IGB_TYPE_SKB) { dev_kfree_skb_any(tx_buffer->skb); - else + } else if (tx_buffer->type == IGB_TYPE_XDP) { xdp_return_frame(tx_buffer->xdpf); + } else if (tx_buffer->type == IGB_TYPE_XSK) { + xsk_frames++; + goto skip_for_xsk; + } /* unmap skb header data */ dma_unmap_single(tx_ring->dev, @@ -4931,6 +4952,7 @@ static void igb_clean_tx_ring(struct igb_ring *tx_ring) DMA_TO_DEVICE); } +skip_for_xsk: tx_buffer->next_to_watch = NULL; /* move us one more past the eop_desc for start of next pkt */ @@ -4945,6 +4967,9 @@ static void igb_clean_tx_ring(struct igb_ring *tx_ring) /* reset BQL for queue */ netdev_tx_reset_queue(txring_txq(tx_ring)); + if (tx_ring->xsk_pool && xsk_frames) + xsk_tx_completed(tx_ring->xsk_pool, xsk_frames); + /* reset next_to_use and next_to_clean */ tx_ring->next_to_use = 0; tx_ring->next_to_clean = 0; @@ -4975,8 +5000,13 @@ void igb_free_rx_resources(struct igb_ring *rx_ring) rx_ring->xdp_prog = NULL; xdp_rxq_info_unreg(&rx_ring->xdp_rxq); - vfree(rx_ring->rx_buffer_info); - rx_ring->rx_buffer_info = NULL; + if (rx_ring->xsk_pool) { + vfree(rx_ring->rx_buffer_info_zc); + rx_ring->rx_buffer_info_zc = NULL; + } else { + vfree(rx_ring->rx_buffer_info); + rx_ring->rx_buffer_info = NULL; + } /* if not set, then don't free */ if (!rx_ring->desc) @@ -5007,13 +5037,18 @@ static void igb_free_all_rx_resources(struct igb_adapter *adapter) * igb_clean_rx_ring - Free Rx Buffers per Queue * @rx_ring: ring to free buffers from **/ -static void igb_clean_rx_ring(struct igb_ring *rx_ring) +void igb_clean_rx_ring(struct igb_ring *rx_ring) { u16 i = rx_ring->next_to_clean; dev_kfree_skb(rx_ring->skb); rx_ring->skb = NULL; + if (rx_ring->xsk_pool) { + igb_clean_rx_ring_zc(rx_ring); + goto skip_for_xsk; + } + /* Free all the Rx ring sk_buffs */ while (i != rx_ring->next_to_alloc) { struct igb_rx_buffer *buffer_info = &rx_ring->rx_buffer_info[i]; @@ -5041,6 +5076,7 @@ static void 
igb_clean_rx_ring(struct igb_ring *rx_ring) i = 0; } +skip_for_xsk: rx_ring->next_to_alloc = 0; rx_ring->next_to_clean = 0; rx_ring->next_to_use = 0; @@ -6467,6 +6503,9 @@ netdev_tx_t igb_xmit_frame_ring(struct sk_buff *skb, return NETDEV_TX_BUSY; } + if (unlikely(test_bit(IGB_RING_FLAG_TX_DISABLED, &tx_ring->flags))) + return NETDEV_TX_BUSY; + /* record the location of the first descriptor for this packet */ first = &tx_ring->tx_buffer_info[tx_ring->next_to_use]; first->type = IGB_TYPE_SKB; @@ -6622,7 +6661,7 @@ static int igb_change_mtu(struct net_device *netdev, int new_mtu) struct igb_adapter *adapter = netdev_priv(netdev); int max_frame = new_mtu + IGB_ETH_PKT_HDR_PAD; - if (adapter->xdp_prog) { + if (igb_xdp_is_enabled(adapter)) { int i; for (i = 0; i < adapter->num_rx_queues; i++) { @@ -8195,6 +8234,7 @@ static int igb_poll(struct napi_struct *napi, int budget) struct igb_q_vector *q_vector = container_of(napi, struct igb_q_vector, napi); + struct xsk_buff_pool *xsk_pool; bool clean_complete = true; int work_done = 0; @@ -8206,7 +8246,12 @@ static int igb_poll(struct napi_struct *napi, int budget) clean_complete = igb_clean_tx_irq(q_vector, budget); if (q_vector->rx.ring) { - int cleaned = igb_clean_rx_irq(q_vector, budget); + int cleaned; + + xsk_pool = READ_ONCE(q_vector->rx.ring->xsk_pool); + cleaned = xsk_pool ? 
+ igb_clean_rx_irq_zc(q_vector, xsk_pool, budget) : + igb_clean_rx_irq(q_vector, budget); work_done += cleaned; if (cleaned >= budget) @@ -8235,13 +8280,18 @@ static int igb_poll(struct napi_struct *napi, int budget) **/ static bool igb_clean_tx_irq(struct igb_q_vector *q_vector, int napi_budget) { - struct igb_adapter *adapter = q_vector->adapter; - struct igb_ring *tx_ring = q_vector->tx.ring; - struct igb_tx_buffer *tx_buffer; - union e1000_adv_tx_desc *tx_desc; unsigned int total_bytes = 0, total_packets = 0; + struct igb_adapter *adapter = q_vector->adapter; unsigned int budget = q_vector->tx.work_limit; + struct igb_ring *tx_ring = q_vector->tx.ring; unsigned int i = tx_ring->next_to_clean; + union e1000_adv_tx_desc *tx_desc; + struct igb_tx_buffer *tx_buffer; + struct xsk_buff_pool *xsk_pool; + int cpu = smp_processor_id(); + bool xsk_xmit_done = true; + struct netdev_queue *nq; + u32 xsk_frames = 0; if (test_bit(__IGB_DOWN, &adapter->state)) return true; @@ -8272,10 +8322,14 @@ static bool igb_clean_tx_irq(struct igb_q_vector *q_vector, int napi_budget) total_packets += tx_buffer->gso_segs; /* free the skb */ - if (tx_buffer->type == IGB_TYPE_SKB) + if (tx_buffer->type == IGB_TYPE_SKB) { napi_consume_skb(tx_buffer->skb, napi_budget); - else + } else if (tx_buffer->type == IGB_TYPE_XDP) { xdp_return_frame(tx_buffer->xdpf); + } else if (tx_buffer->type == IGB_TYPE_XSK) { + xsk_frames++; + goto skip_for_xsk; + } /* unmap skb header data */ dma_unmap_single(tx_ring->dev, @@ -8307,6 +8361,7 @@ static bool igb_clean_tx_irq(struct igb_q_vector *q_vector, int napi_budget) } } +skip_for_xsk: /* move us one more past the eop_desc for start of next pkt */ tx_buffer++; tx_desc++; @@ -8335,6 +8390,21 @@ static bool igb_clean_tx_irq(struct igb_q_vector *q_vector, int napi_budget) q_vector->tx.total_bytes += total_bytes; q_vector->tx.total_packets += total_packets; + xsk_pool = READ_ONCE(tx_ring->xsk_pool); + if (xsk_pool) { + if (xsk_frames) + xsk_tx_completed(xsk_pool, 
xsk_frames); + if (xsk_uses_need_wakeup(xsk_pool)) + xsk_set_tx_need_wakeup(xsk_pool); + + nq = txring_txq(tx_ring); + __netif_tx_lock(nq, cpu); + /* Avoid transmit queue timeout since we share it with the slow path */ + txq_trans_cond_update(nq); + xsk_xmit_done = igb_xmit_zc(tx_ring, xsk_pool); + __netif_tx_unlock(nq); + } + if (test_bit(IGB_RING_FLAG_TX_DETECT_HANG, &tx_ring->flags)) { struct e1000_hw *hw = &adapter->hw; @@ -8397,7 +8467,7 @@ static bool igb_clean_tx_irq(struct igb_q_vector *q_vector, int napi_budget) } } - return !!budget; + return !!budget && xsk_xmit_done; } /** @@ -8588,9 +8658,8 @@ static struct sk_buff *igb_build_skb(struct igb_ring *rx_ring, return skb; } -static struct sk_buff *igb_run_xdp(struct igb_adapter *adapter, - struct igb_ring *rx_ring, - struct xdp_buff *xdp) +static int igb_run_xdp(struct igb_adapter *adapter, struct igb_ring *rx_ring, + struct xdp_buff *xdp) { int err, result = IGB_XDP_PASS; struct bpf_prog *xdp_prog; @@ -8630,7 +8699,7 @@ out_failure: break; } xdp_out: - return ERR_PTR(-result); + return result; } static unsigned int igb_rx_frame_truesize(struct igb_ring *rx_ring, @@ -8756,10 +8825,6 @@ static bool igb_cleanup_headers(struct igb_ring *rx_ring, union e1000_adv_rx_desc *rx_desc, struct sk_buff *skb) { - /* XDP packets use error pointer so abort at this point */ - if (IS_ERR(skb)) - return true; - if (unlikely((igb_test_staterr(rx_desc, E1000_RXDEXT_ERR_FRAME_ERR_MASK)))) { struct net_device *netdev = rx_ring->netdev; @@ -8786,9 +8851,9 @@ static bool igb_cleanup_headers(struct igb_ring *rx_ring, * order to populate the hash, checksum, VLAN, timestamp, protocol, and * other fields within the skb. 
**/ -static void igb_process_skb_fields(struct igb_ring *rx_ring, - union e1000_adv_rx_desc *rx_desc, - struct sk_buff *skb) +void igb_process_skb_fields(struct igb_ring *rx_ring, + union e1000_adv_rx_desc *rx_desc, + struct sk_buff *skb) { struct net_device *dev = rx_ring->netdev; @@ -8870,6 +8935,38 @@ static void igb_put_rx_buffer(struct igb_ring *rx_ring, rx_buffer->page = NULL; } +void igb_finalize_xdp(struct igb_adapter *adapter, unsigned int status) +{ + int cpu = smp_processor_id(); + struct netdev_queue *nq; + + if (status & IGB_XDP_REDIR) + xdp_do_flush(); + + if (status & IGB_XDP_TX) { + struct igb_ring *tx_ring = igb_xdp_tx_queue_mapping(adapter); + + nq = txring_txq(tx_ring); + __netif_tx_lock(nq, cpu); + igb_xdp_ring_update_tail(tx_ring); + __netif_tx_unlock(nq); + } +} + +void igb_update_rx_stats(struct igb_q_vector *q_vector, unsigned int packets, + unsigned int bytes) +{ + struct igb_ring *ring = q_vector->rx.ring; + + u64_stats_update_begin(&ring->rx_syncp); + ring->rx_stats.packets += packets; + ring->rx_stats.bytes += bytes; + u64_stats_update_end(&ring->rx_syncp); + + q_vector->rx.total_packets += packets; + q_vector->rx.total_bytes += bytes; +} + static int igb_clean_rx_irq(struct igb_q_vector *q_vector, const int budget) { unsigned int total_bytes = 0, total_packets = 0; @@ -8877,12 +8974,11 @@ static int igb_clean_rx_irq(struct igb_q_vector *q_vector, const int budget) struct igb_ring *rx_ring = q_vector->rx.ring; u16 cleaned_count = igb_desc_unused(rx_ring); struct sk_buff *skb = rx_ring->skb; - int cpu = smp_processor_id(); unsigned int xdp_xmit = 0; - struct netdev_queue *nq; struct xdp_buff xdp; u32 frame_sz = 0; int rx_buf_pgcnt; + int xdp_res = 0; /* Frame size depend on rx_ring setup when PAGE_SIZE=4K */ #if (PAGE_SIZE < 8192) @@ -8940,12 +9036,10 @@ static int igb_clean_rx_irq(struct igb_q_vector *q_vector, const int budget) /* At larger PAGE_SIZE, frame_sz depend on len size */ xdp.frame_sz = igb_rx_frame_truesize(rx_ring, size); 
#endif - skb = igb_run_xdp(adapter, rx_ring, &xdp); + xdp_res = igb_run_xdp(adapter, rx_ring, &xdp); } - if (IS_ERR(skb)) { - unsigned int xdp_res = -PTR_ERR(skb); - + if (xdp_res) { if (xdp_res & (IGB_XDP_TX | IGB_XDP_REDIR)) { xdp_xmit |= xdp_res; igb_rx_buffer_flip(rx_ring, rx_buffer, size); @@ -8964,7 +9058,7 @@ static int igb_clean_rx_irq(struct igb_q_vector *q_vector, const int budget) &xdp, timestamp); /* exit if we failed to retrieve a buffer */ - if (!skb) { + if (!xdp_res && !skb) { rx_ring->rx_stats.alloc_failed++; rx_buffer->pagecnt_bias++; break; @@ -8978,7 +9072,7 @@ static int igb_clean_rx_irq(struct igb_q_vector *q_vector, const int budget) continue; /* verify the packet layout is correct */ - if (igb_cleanup_headers(rx_ring, rx_desc, skb)) { + if (xdp_res || igb_cleanup_headers(rx_ring, rx_desc, skb)) { skb = NULL; continue; } @@ -9001,24 +9095,10 @@ static int igb_clean_rx_irq(struct igb_q_vector *q_vector, const int budget) /* place incomplete frames back on ring for completion */ rx_ring->skb = skb; - if (xdp_xmit & IGB_XDP_REDIR) - xdp_do_flush(); - - if (xdp_xmit & IGB_XDP_TX) { - struct igb_ring *tx_ring = igb_xdp_tx_queue_mapping(adapter); - - nq = txring_txq(tx_ring); - __netif_tx_lock(nq, cpu); - igb_xdp_ring_update_tail(tx_ring); - __netif_tx_unlock(nq); - } + if (xdp_xmit) + igb_finalize_xdp(adapter, xdp_xmit); - u64_stats_update_begin(&rx_ring->rx_syncp); - rx_ring->rx_stats.packets += total_packets; - rx_ring->rx_stats.bytes += total_bytes; - u64_stats_update_end(&rx_ring->rx_syncp); - q_vector->rx.total_packets += total_packets; - q_vector->rx.total_bytes += total_bytes; + igb_update_rx_stats(q_vector, total_packets, total_bytes); if (cleaned_count) igb_alloc_rx_buffers(rx_ring, cleaned_count); diff --git a/drivers/net/ethernet/intel/igb/igb_xsk.c b/drivers/net/ethernet/intel/igb/igb_xsk.c new file mode 100644 index 000000000000..157d43787fa0 --- /dev/null +++ b/drivers/net/ethernet/intel/igb/igb_xsk.c @@ -0,0 +1,562 @@ +// 
SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2018 Intel Corporation. */ + +#include <linux/bpf_trace.h> +#include <net/xdp_sock_drv.h> +#include <net/xdp.h> + +#include "e1000_hw.h" +#include "igb.h" + +static int igb_realloc_rx_buffer_info(struct igb_ring *ring, bool pool_present) +{ + int size = pool_present ? + sizeof(*ring->rx_buffer_info_zc) * ring->count : + sizeof(*ring->rx_buffer_info) * ring->count; + void *buff_info = vmalloc(size); + + if (!buff_info) + return -ENOMEM; + + if (pool_present) { + vfree(ring->rx_buffer_info); + ring->rx_buffer_info = NULL; + ring->rx_buffer_info_zc = buff_info; + } else { + vfree(ring->rx_buffer_info_zc); + ring->rx_buffer_info_zc = NULL; + ring->rx_buffer_info = buff_info; + } + + return 0; +} + +static void igb_txrx_ring_disable(struct igb_adapter *adapter, u16 qid) +{ + struct igb_ring *tx_ring = adapter->tx_ring[qid]; + struct igb_ring *rx_ring = adapter->rx_ring[qid]; + struct e1000_hw *hw = &adapter->hw; + + set_bit(IGB_RING_FLAG_TX_DISABLED, &tx_ring->flags); + + wr32(E1000_TXDCTL(tx_ring->reg_idx), 0); + wr32(E1000_RXDCTL(rx_ring->reg_idx), 0); + + synchronize_net(); + + /* Rx/Tx share the same napi context. 
*/ + napi_disable(&rx_ring->q_vector->napi); + + igb_clean_tx_ring(tx_ring); + igb_clean_rx_ring(rx_ring); + + memset(&rx_ring->rx_stats, 0, sizeof(rx_ring->rx_stats)); + memset(&tx_ring->tx_stats, 0, sizeof(tx_ring->tx_stats)); +} + +static void igb_txrx_ring_enable(struct igb_adapter *adapter, u16 qid) +{ + struct igb_ring *tx_ring = adapter->tx_ring[qid]; + struct igb_ring *rx_ring = adapter->rx_ring[qid]; + + igb_configure_tx_ring(adapter, tx_ring); + igb_configure_rx_ring(adapter, rx_ring); + + synchronize_net(); + + clear_bit(IGB_RING_FLAG_TX_DISABLED, &tx_ring->flags); + + /* call igb_desc_unused which always leaves + * at least 1 descriptor unused to make sure + * next_to_use != next_to_clean + */ + if (rx_ring->xsk_pool) + igb_alloc_rx_buffers_zc(rx_ring, rx_ring->xsk_pool, + igb_desc_unused(rx_ring)); + else + igb_alloc_rx_buffers(rx_ring, igb_desc_unused(rx_ring)); + + /* Rx/Tx share the same napi context. */ + napi_enable(&rx_ring->q_vector->napi); +} + +struct xsk_buff_pool *igb_xsk_pool(struct igb_adapter *adapter, + struct igb_ring *ring) +{ + int qid = ring->queue_index; + struct xsk_buff_pool *pool; + + pool = xsk_get_pool_from_qid(adapter->netdev, qid); + + if (!igb_xdp_is_enabled(adapter)) + return NULL; + + return (pool && pool->dev) ? 
pool : NULL; +} + +static int igb_xsk_pool_enable(struct igb_adapter *adapter, + struct xsk_buff_pool *pool, + u16 qid) +{ + struct net_device *netdev = adapter->netdev; + struct igb_ring *rx_ring; + bool if_running; + int err; + + if (qid >= adapter->num_rx_queues) + return -EINVAL; + + if (qid >= netdev->real_num_rx_queues || + qid >= netdev->real_num_tx_queues) + return -EINVAL; + + err = xsk_pool_dma_map(pool, &adapter->pdev->dev, IGB_RX_DMA_ATTR); + if (err) + return err; + + rx_ring = adapter->rx_ring[qid]; + if_running = netif_running(adapter->netdev) && igb_xdp_is_enabled(adapter); + if (if_running) + igb_txrx_ring_disable(adapter, qid); + + if (if_running) { + err = igb_realloc_rx_buffer_info(rx_ring, true); + if (!err) { + igb_txrx_ring_enable(adapter, qid); + /* Kick start the NAPI context so that receiving will start */ + err = igb_xsk_wakeup(adapter->netdev, qid, XDP_WAKEUP_RX); + } + + if (err) { + xsk_pool_dma_unmap(pool, IGB_RX_DMA_ATTR); + return err; + } + } + + return 0; +} + +static int igb_xsk_pool_disable(struct igb_adapter *adapter, u16 qid) +{ + struct xsk_buff_pool *pool; + struct igb_ring *rx_ring; + bool if_running; + int err; + + pool = xsk_get_pool_from_qid(adapter->netdev, qid); + if (!pool) + return -EINVAL; + + rx_ring = adapter->rx_ring[qid]; + if_running = netif_running(adapter->netdev) && igb_xdp_is_enabled(adapter); + if (if_running) + igb_txrx_ring_disable(adapter, qid); + + xsk_pool_dma_unmap(pool, IGB_RX_DMA_ATTR); + + if (if_running) { + err = igb_realloc_rx_buffer_info(rx_ring, false); + if (err) + return err; + + igb_txrx_ring_enable(adapter, qid); + } + + return 0; +} + +int igb_xsk_pool_setup(struct igb_adapter *adapter, + struct xsk_buff_pool *pool, + u16 qid) +{ + return pool ? 
igb_xsk_pool_enable(adapter, pool, qid) : + igb_xsk_pool_disable(adapter, qid); +} + +static u16 igb_fill_rx_descs(struct xsk_buff_pool *pool, struct xdp_buff **xdp, + union e1000_adv_rx_desc *rx_desc, u16 count) +{ + dma_addr_t dma; + u16 buffs; + int i; + + /* nothing to do */ + if (!count) + return 0; + + buffs = xsk_buff_alloc_batch(pool, xdp, count); + for (i = 0; i < buffs; i++) { + dma = xsk_buff_xdp_get_dma(*xdp); + rx_desc->read.pkt_addr = cpu_to_le64(dma); + rx_desc->wb.upper.length = 0; + + rx_desc++; + xdp++; + } + + return buffs; +} + +bool igb_alloc_rx_buffers_zc(struct igb_ring *rx_ring, + struct xsk_buff_pool *xsk_pool, u16 count) +{ + u32 nb_buffs_extra = 0, nb_buffs = 0; + union e1000_adv_rx_desc *rx_desc; + u16 ntu = rx_ring->next_to_use; + u16 total_count = count; + struct xdp_buff **xdp; + + rx_desc = IGB_RX_DESC(rx_ring, ntu); + xdp = &rx_ring->rx_buffer_info_zc[ntu]; + + if (ntu + count >= rx_ring->count) { + nb_buffs_extra = igb_fill_rx_descs(xsk_pool, xdp, rx_desc, + rx_ring->count - ntu); + if (nb_buffs_extra != rx_ring->count - ntu) { + ntu += nb_buffs_extra; + goto exit; + } + rx_desc = IGB_RX_DESC(rx_ring, 0); + xdp = rx_ring->rx_buffer_info_zc; + ntu = 0; + count -= nb_buffs_extra; + } + + nb_buffs = igb_fill_rx_descs(xsk_pool, xdp, rx_desc, count); + ntu += nb_buffs; + if (ntu == rx_ring->count) + ntu = 0; + + /* clear the length for the next_to_use descriptor */ + rx_desc = IGB_RX_DESC(rx_ring, ntu); + rx_desc->wb.upper.length = 0; + +exit: + if (rx_ring->next_to_use != ntu) { + rx_ring->next_to_use = ntu; + + /* Force memory writes to complete before letting h/w + * know there are new descriptors to fetch. (Only + * applicable for weak-ordered memory model archs, + * such as IA-64). 
+ */ + wmb(); + writel(ntu, rx_ring->tail); + } + + return total_count == (nb_buffs + nb_buffs_extra); +} + +void igb_clean_rx_ring_zc(struct igb_ring *rx_ring) +{ + u16 ntc = rx_ring->next_to_clean; + u16 ntu = rx_ring->next_to_use; + + while (ntc != ntu) { + struct xdp_buff *xdp = rx_ring->rx_buffer_info_zc[ntc]; + + xsk_buff_free(xdp); + ntc++; + if (ntc >= rx_ring->count) + ntc = 0; + } +} + +static struct sk_buff *igb_construct_skb_zc(struct igb_ring *rx_ring, + struct xdp_buff *xdp, + ktime_t timestamp) +{ + unsigned int totalsize = xdp->data_end - xdp->data_meta; + unsigned int metasize = xdp->data - xdp->data_meta; + struct sk_buff *skb; + + net_prefetch(xdp->data_meta); + + /* allocate a skb to store the frags */ + skb = napi_alloc_skb(&rx_ring->q_vector->napi, totalsize); + if (unlikely(!skb)) + return NULL; + + if (timestamp) + skb_hwtstamps(skb)->hwtstamp = timestamp; + + memcpy(__skb_put(skb, totalsize), xdp->data_meta, + ALIGN(totalsize, sizeof(long))); + + if (metasize) { + skb_metadata_set(skb, metasize); + __skb_pull(skb, metasize); + } + + return skb; +} + +static int igb_run_xdp_zc(struct igb_adapter *adapter, struct igb_ring *rx_ring, + struct xdp_buff *xdp, struct xsk_buff_pool *xsk_pool, + struct bpf_prog *xdp_prog) +{ + int err, result = IGB_XDP_PASS; + u32 act; + + prefetchw(xdp->data_hard_start); /* xdp_frame write */ + + act = bpf_prog_run_xdp(xdp_prog, xdp); + + if (likely(act == XDP_REDIRECT)) { + err = xdp_do_redirect(adapter->netdev, xdp, xdp_prog); + if (!err) + return IGB_XDP_REDIR; + + if (xsk_uses_need_wakeup(xsk_pool) && + err == -ENOBUFS) + result = IGB_XDP_EXIT; + else + result = IGB_XDP_CONSUMED; + goto out_failure; + } + + switch (act) { + case XDP_PASS: + break; + case XDP_TX: + result = igb_xdp_xmit_back(adapter, xdp); + if (result == IGB_XDP_CONSUMED) + goto out_failure; + break; + default: + bpf_warn_invalid_xdp_action(adapter->netdev, xdp_prog, act); + fallthrough; + case XDP_ABORTED: +out_failure: + 
trace_xdp_exception(rx_ring->netdev, xdp_prog, act); + fallthrough; + case XDP_DROP: + result = IGB_XDP_CONSUMED; + break; + } + + return result; +} + +int igb_clean_rx_irq_zc(struct igb_q_vector *q_vector, + struct xsk_buff_pool *xsk_pool, const int budget) +{ + struct igb_adapter *adapter = q_vector->adapter; + unsigned int total_bytes = 0, total_packets = 0; + struct igb_ring *rx_ring = q_vector->rx.ring; + u32 ntc = rx_ring->next_to_clean; + struct bpf_prog *xdp_prog; + unsigned int xdp_xmit = 0; + bool failure = false; + u16 entries_to_alloc; + struct sk_buff *skb; + + /* xdp_prog cannot be NULL in the ZC path */ + xdp_prog = READ_ONCE(rx_ring->xdp_prog); + + while (likely(total_packets < budget)) { + union e1000_adv_rx_desc *rx_desc; + ktime_t timestamp = 0; + struct xdp_buff *xdp; + unsigned int size; + int xdp_res = 0; + + rx_desc = IGB_RX_DESC(rx_ring, ntc); + size = le16_to_cpu(rx_desc->wb.upper.length); + if (!size) + break; + + /* This memory barrier is needed to keep us from reading + * any other fields out of the rx_desc until we know the + * descriptor has been written back + */ + dma_rmb(); + + xdp = rx_ring->rx_buffer_info_zc[ntc]; + xsk_buff_set_size(xdp, size); + xsk_buff_dma_sync_for_cpu(xdp); + + /* pull rx packet timestamp if available and valid */ + if (igb_test_staterr(rx_desc, E1000_RXDADV_STAT_TSIP)) { + int ts_hdr_len; + + ts_hdr_len = igb_ptp_rx_pktstamp(rx_ring->q_vector, + xdp->data, + &timestamp); + + xdp->data += ts_hdr_len; + xdp->data_meta += ts_hdr_len; + size -= ts_hdr_len; + } + + xdp_res = igb_run_xdp_zc(adapter, rx_ring, xdp, xsk_pool, + xdp_prog); + + if (xdp_res) { + if (likely(xdp_res & (IGB_XDP_TX | IGB_XDP_REDIR))) { + xdp_xmit |= xdp_res; + } else if (xdp_res == IGB_XDP_EXIT) { + failure = true; + break; + } else if (xdp_res == IGB_XDP_CONSUMED) { + xsk_buff_free(xdp); + } + + total_packets++; + total_bytes += size; + ntc++; + if (ntc == rx_ring->count) + ntc = 0; + continue; + } + + skb = igb_construct_skb_zc(rx_ring, xdp, 
timestamp); + + /* exit if we failed to retrieve a buffer */ + if (!skb) { + rx_ring->rx_stats.alloc_failed++; + break; + } + + xsk_buff_free(xdp); + ntc++; + if (ntc == rx_ring->count) + ntc = 0; + + if (eth_skb_pad(skb)) + continue; + + /* probably a little skewed due to removing CRC */ + total_bytes += skb->len; + + /* populate checksum, timestamp, VLAN, and protocol */ + igb_process_skb_fields(rx_ring, rx_desc, skb); + + napi_gro_receive(&q_vector->napi, skb); + + /* update budget accounting */ + total_packets++; + } + + rx_ring->next_to_clean = ntc; + + if (xdp_xmit) + igb_finalize_xdp(adapter, xdp_xmit); + + igb_update_rx_stats(q_vector, total_packets, total_bytes); + + entries_to_alloc = igb_desc_unused(rx_ring); + if (entries_to_alloc >= IGB_RX_BUFFER_WRITE) + failure |= !igb_alloc_rx_buffers_zc(rx_ring, xsk_pool, + entries_to_alloc); + + if (xsk_uses_need_wakeup(xsk_pool)) { + if (failure || rx_ring->next_to_clean == rx_ring->next_to_use) + xsk_set_rx_need_wakeup(xsk_pool); + else + xsk_clear_rx_need_wakeup(xsk_pool); + + return (int)total_packets; + } + return failure ? 
budget : (int)total_packets; +} + +bool igb_xmit_zc(struct igb_ring *tx_ring, struct xsk_buff_pool *xsk_pool) +{ + unsigned int budget = igb_desc_unused(tx_ring); + u32 cmd_type, olinfo_status, nb_pkts, i = 0; + struct xdp_desc *descs = xsk_pool->tx_descs; + union e1000_adv_tx_desc *tx_desc = NULL; + struct igb_tx_buffer *tx_buffer_info; + unsigned int total_bytes = 0; + dma_addr_t dma; + + if (!netif_carrier_ok(tx_ring->netdev)) + return true; + + if (test_bit(IGB_RING_FLAG_TX_DISABLED, &tx_ring->flags)) + return true; + + nb_pkts = xsk_tx_peek_release_desc_batch(xsk_pool, budget); + if (!nb_pkts) + return true; + + while (nb_pkts-- > 0) { + dma = xsk_buff_raw_get_dma(xsk_pool, descs[i].addr); + xsk_buff_raw_dma_sync_for_device(xsk_pool, dma, descs[i].len); + + tx_buffer_info = &tx_ring->tx_buffer_info[tx_ring->next_to_use]; + tx_buffer_info->bytecount = descs[i].len; + tx_buffer_info->type = IGB_TYPE_XSK; + tx_buffer_info->xdpf = NULL; + tx_buffer_info->gso_segs = 1; + tx_buffer_info->time_stamp = jiffies; + + tx_desc = IGB_TX_DESC(tx_ring, tx_ring->next_to_use); + tx_desc->read.buffer_addr = cpu_to_le64(dma); + + /* put descriptor type bits */ + cmd_type = E1000_ADVTXD_DTYP_DATA | E1000_ADVTXD_DCMD_DEXT | + E1000_ADVTXD_DCMD_IFCS; + olinfo_status = descs[i].len << E1000_ADVTXD_PAYLEN_SHIFT; + + /* FIXME: This sets the Report Status (RS) bit for every + * descriptor. One nice to have optimization would be to set it + * only for the last descriptor in the whole batch. See Intel + * ice driver for an example on how to do it. 
+ */ + cmd_type |= descs[i].len | IGB_TXD_DCMD; + tx_desc->read.cmd_type_len = cpu_to_le32(cmd_type); + tx_desc->read.olinfo_status = cpu_to_le32(olinfo_status); + + total_bytes += descs[i].len; + + i++; + tx_ring->next_to_use++; + tx_buffer_info->next_to_watch = tx_desc; + if (tx_ring->next_to_use == tx_ring->count) + tx_ring->next_to_use = 0; + } + + netdev_tx_sent_queue(txring_txq(tx_ring), total_bytes); + igb_xdp_ring_update_tail(tx_ring); + + return i < budget; /* nb_pkts wrapped in the loop; i = descs queued */ +} + +int igb_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags) +{ + struct igb_adapter *adapter = netdev_priv(dev); + struct e1000_hw *hw = &adapter->hw; + struct igb_ring *ring; + u32 eics = 0; + + if (test_bit(__IGB_DOWN, &adapter->state)) + return -ENETDOWN; + + if (!igb_xdp_is_enabled(adapter)) + return -EINVAL; + + if (qid >= adapter->num_tx_queues) + return -EINVAL; + + ring = adapter->tx_ring[qid]; + + if (test_bit(IGB_RING_FLAG_TX_DISABLED, &ring->flags)) + return -ENETDOWN; + + if (!READ_ONCE(ring->xsk_pool)) + return -EINVAL; + + if (!napi_if_scheduled_mark_missed(&ring->q_vector->napi)) { + /* Cause software interrupt */ + if (adapter->flags & IGB_FLAG_HAS_MSIX) { + eics |= ring->q_vector->eims_value; + wr32(E1000_EICS, eics); + } else { + wr32(E1000_ICS, E1000_ICS_RXDMT0); + } + } + + return 0; +} diff --git a/drivers/net/ethernet/intel/igc/igc.h b/drivers/net/ethernet/intel/igc/igc.h index eac0f966e0e4..b8111ad9a9a8 100644 --- a/drivers/net/ethernet/intel/igc/igc.h +++ b/drivers/net/ethernet/intel/igc/igc.h @@ -337,6 +337,8 @@ struct igc_adapter { struct igc_led_classdev *leds; }; +void igc_set_queue_napi(struct igc_adapter *adapter, int q_idx, + struct napi_struct *napi); void igc_up(struct igc_adapter *adapter); void igc_down(struct igc_adapter *adapter); int igc_open(struct net_device *netdev); diff --git a/drivers/net/ethernet/intel/igc/igc_base.c b/drivers/net/ethernet/intel/igc/igc_base.c index 9fae8bdec2a7..1613b562d17c 100644 --- 
a/drivers/net/ethernet/intel/igc/igc_base.c +++ b/drivers/net/ethernet/intel/igc/igc_base.c @@ -68,6 +68,10 @@ static s32 igc_init_nvm_params_base(struct igc_hw *hw) u32 eecd = rd32(IGC_EECD); u16 size; + /* failed to read reg and got all F's */ + if (!(~eecd)) + return -ENXIO; + size = FIELD_GET(IGC_EECD_SIZE_EX_MASK, eecd); /* Added to a constant, "size" becomes the left-shift value @@ -221,6 +225,8 @@ static s32 igc_get_invariants_base(struct igc_hw *hw) /* NVM initialization */ ret_val = igc_init_nvm_params_base(hw); + if (ret_val) + goto out; switch (hw->mac.type) { case igc_i225: ret_val = igc_init_nvm_params_i225(hw); diff --git a/drivers/net/ethernet/intel/igc/igc_hw.h b/drivers/net/ethernet/intel/igc/igc_hw.h index d9d1a1a11daf..be8a49a86d09 100644 --- a/drivers/net/ethernet/intel/igc/igc_hw.h +++ b/drivers/net/ethernet/intel/igc/igc_hw.h @@ -279,9 +279,4 @@ struct net_device *igc_get_hw_dev(struct igc_hw *hw); #define hw_dbg(format, arg...) \ netdev_dbg(igc_get_hw_dev(hw), format, ##arg) -s32 igc_read_pcie_cap_reg(struct igc_hw *hw, u32 reg, u16 *value); -s32 igc_write_pcie_cap_reg(struct igc_hw *hw, u32 reg, u16 *value); -void igc_read_pci_cfg(struct igc_hw *hw, u32 reg, u16 *value); -void igc_write_pci_cfg(struct igc_hw *hw, u32 reg, u16 *value); - #endif /* _IGC_HW_H_ */ diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 27872bdea9bd..56a35d58e7a6 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -2123,10 +2123,6 @@ static bool igc_cleanup_headers(struct igc_ring *rx_ring, union igc_adv_rx_desc *rx_desc, struct sk_buff *skb) { - /* XDP packets use error pointer so abort at this point */ - if (IS_ERR(skb)) - return true; - if (unlikely(igc_test_staterr(rx_desc, IGC_RXDEXT_STATERR_RXE))) { struct net_device *netdev = rx_ring->netdev; @@ -2515,8 +2511,7 @@ out_failure: } } -static struct sk_buff *igc_xdp_run_prog(struct igc_adapter *adapter, - 
struct xdp_buff *xdp) +static int igc_xdp_run_prog(struct igc_adapter *adapter, struct xdp_buff *xdp) { struct bpf_prog *prog; int res; @@ -2530,7 +2525,7 @@ static struct sk_buff *igc_xdp_run_prog(struct igc_adapter *adapter, res = __igc_xdp_run_prog(adapter, prog, xdp); out: - return ERR_PTR(-res); + return res; } /* This function assumes __netif_tx_lock is held by the caller. */ @@ -2585,6 +2580,7 @@ static int igc_clean_rx_irq(struct igc_q_vector *q_vector, const int budget) struct sk_buff *skb = rx_ring->skb; u16 cleaned_count = igc_desc_unused(rx_ring); int xdp_status = 0, rx_buffer_pgcnt; + int xdp_res = 0; while (likely(total_packets < budget)) { struct igc_xdp_buff ctx = { .rx_ts = NULL }; @@ -2630,12 +2626,10 @@ static int igc_clean_rx_irq(struct igc_q_vector *q_vector, const int budget) xdp_buff_clear_frags_flag(&ctx.xdp); ctx.rx_desc = rx_desc; - skb = igc_xdp_run_prog(adapter, &ctx.xdp); + xdp_res = igc_xdp_run_prog(adapter, &ctx.xdp); } - if (IS_ERR(skb)) { - unsigned int xdp_res = -PTR_ERR(skb); - + if (xdp_res) { switch (xdp_res) { case IGC_XDP_CONSUMED: rx_buffer->pagecnt_bias++; @@ -2657,7 +2651,7 @@ static int igc_clean_rx_irq(struct igc_q_vector *q_vector, const int budget) skb = igc_construct_skb(rx_ring, rx_buffer, &ctx); /* exit if we failed to retrieve a buffer */ - if (!skb) { + if (!xdp_res && !skb) { rx_ring->rx_stats.alloc_failed++; rx_buffer->pagecnt_bias++; set_bit(IGC_RING_FLAG_RX_ALLOC_FAILED, &rx_ring->flags); @@ -2672,7 +2666,7 @@ static int igc_clean_rx_irq(struct igc_q_vector *q_vector, const int budget) continue; /* verify the packet layout is correct */ - if (igc_cleanup_headers(rx_ring, rx_desc, skb)) { + if (xdp_res || igc_cleanup_headers(rx_ring, rx_desc, skb)) { skb = NULL; continue; } @@ -4948,6 +4942,22 @@ static int igc_sw_init(struct igc_adapter *adapter) return 0; } +void igc_set_queue_napi(struct igc_adapter *adapter, int vector, + struct napi_struct *napi) +{ + struct igc_q_vector *q_vector = 
adapter->q_vector[vector]; + + if (q_vector->rx.ring) + netif_queue_set_napi(adapter->netdev, + q_vector->rx.ring->queue_index, + NETDEV_QUEUE_TYPE_RX, napi); + + if (q_vector->tx.ring) + netif_queue_set_napi(adapter->netdev, + q_vector->tx.ring->queue_index, + NETDEV_QUEUE_TYPE_TX, napi); +} + /** * igc_up - Open the interface and prepare it to handle traffic * @adapter: board private structure @@ -4955,6 +4965,7 @@ static int igc_sw_init(struct igc_adapter *adapter) void igc_up(struct igc_adapter *adapter) { struct igc_hw *hw = &adapter->hw; + struct napi_struct *napi; int i = 0; /* hardware has been reset, we need to reload some things */ @@ -4962,8 +4973,11 @@ void igc_up(struct igc_adapter *adapter) clear_bit(__IGC_DOWN, &adapter->state); - for (i = 0; i < adapter->num_q_vectors; i++) - napi_enable(&adapter->q_vector[i]->napi); + for (i = 0; i < adapter->num_q_vectors; i++) { + napi = &adapter->q_vector[i]->napi; + napi_enable(napi); + igc_set_queue_napi(adapter, i, napi); + } if (adapter->msix_entries) igc_configure_msix(adapter); @@ -5192,6 +5206,7 @@ void igc_down(struct igc_adapter *adapter) for (i = 0; i < adapter->num_q_vectors; i++) { if (adapter->q_vector[i]) { napi_synchronize(&adapter->q_vector[i]->napi); + igc_set_queue_napi(adapter, i, NULL); napi_disable(&adapter->q_vector[i]->napi); } } @@ -5576,6 +5591,9 @@ static int igc_request_msix(struct igc_adapter *adapter) q_vector); if (err) goto err_free; + + netif_napi_set_irq(&q_vector->napi, + adapter->msix_entries[vector].vector); } igc_configure_msix(adapter); @@ -6018,6 +6036,7 @@ static int __igc_open(struct net_device *netdev, bool resuming) struct igc_adapter *adapter = netdev_priv(netdev); struct pci_dev *pdev = adapter->pdev; struct igc_hw *hw = &adapter->hw; + struct napi_struct *napi; int err = 0; int i = 0; @@ -6053,8 +6072,11 @@ static int __igc_open(struct net_device *netdev, bool resuming) clear_bit(__IGC_DOWN, &adapter->state); - for (i = 0; i < adapter->num_q_vectors; i++) - 
napi_enable(&adapter->q_vector[i]->napi); + for (i = 0; i < adapter->num_q_vectors; i++) { + napi = &adapter->q_vector[i]->napi; + napi_enable(napi); + igc_set_queue_napi(adapter, i, napi); + } /* Clear any pending interrupts. */ rd32(IGC_ICR); @@ -6779,45 +6801,6 @@ static const struct net_device_ops igc_netdev_ops = { .ndo_get_tstamp = igc_get_tstamp, }; -/* PCIe configuration access */ -void igc_read_pci_cfg(struct igc_hw *hw, u32 reg, u16 *value) -{ - struct igc_adapter *adapter = hw->back; - - pci_read_config_word(adapter->pdev, reg, value); -} - -void igc_write_pci_cfg(struct igc_hw *hw, u32 reg, u16 *value) -{ - struct igc_adapter *adapter = hw->back; - - pci_write_config_word(adapter->pdev, reg, *value); -} - -s32 igc_read_pcie_cap_reg(struct igc_hw *hw, u32 reg, u16 *value) -{ - struct igc_adapter *adapter = hw->back; - - if (!pci_is_pcie(adapter->pdev)) - return -IGC_ERR_CONFIG; - - pcie_capability_read_word(adapter->pdev, reg, value); - - return IGC_SUCCESS; -} - -s32 igc_write_pcie_cap_reg(struct igc_hw *hw, u32 reg, u16 *value) -{ - struct igc_adapter *adapter = hw->back; - - if (!pci_is_pcie(adapter->pdev)) - return -IGC_ERR_CONFIG; - - pcie_capability_write_word(adapter->pdev, reg, *value); - - return IGC_SUCCESS; -} - u32 igc_rd32(struct igc_hw *hw, u32 reg) { struct igc_adapter *igc = container_of(hw, struct igc_adapter, hw); @@ -7338,7 +7321,7 @@ static void igc_deliver_wake_packet(struct net_device *netdev) netif_rx(skb); } -static int igc_resume(struct device *dev) +static int __igc_resume(struct device *dev, bool rpm) { struct pci_dev *pdev = to_pci_dev(dev); struct net_device *netdev = pci_get_drvdata(pdev); @@ -7381,7 +7364,11 @@ static int igc_resume(struct device *dev) wr32(IGC_WUS, ~0); if (netif_running(netdev)) { + if (!rpm) + rtnl_lock(); err = __igc_open(netdev, true); + if (!rpm) + rtnl_unlock(); if (!err) netif_device_attach(netdev); } @@ -7389,9 +7376,14 @@ static int igc_resume(struct device *dev) return err; } +static int 
igc_resume(struct device *dev) +{ + return __igc_resume(dev, false); +} + static int igc_runtime_resume(struct device *dev) { - return igc_resume(dev); + return __igc_resume(dev, true); } static int igc_suspend(struct device *dev) @@ -7436,14 +7428,18 @@ static pci_ers_result_t igc_io_error_detected(struct pci_dev *pdev, struct net_device *netdev = pci_get_drvdata(pdev); struct igc_adapter *adapter = netdev_priv(netdev); + rtnl_lock(); netif_device_detach(netdev); - if (state == pci_channel_io_perm_failure) + if (state == pci_channel_io_perm_failure) { + rtnl_unlock(); return PCI_ERS_RESULT_DISCONNECT; + } if (netif_running(netdev)) igc_down(adapter); pci_disable_device(pdev); + rtnl_unlock(); /* Request a slot reset. */ return PCI_ERS_RESULT_NEED_RESET; @@ -7454,7 +7450,7 @@ static pci_ers_result_t igc_io_error_detected(struct pci_dev *pdev, * @pdev: Pointer to PCI device * * Restart the card from scratch, as if from a cold-boot. Implementation - * resembles the first-half of the igc_resume routine. + * resembles the first-half of the __igc_resume routine. **/ static pci_ers_result_t igc_io_slot_reset(struct pci_dev *pdev) { @@ -7493,7 +7489,7 @@ static pci_ers_result_t igc_io_slot_reset(struct pci_dev *pdev) * * This callback is called when the error recovery driver tells us that * its OK to resume normal operation. Implementation resembles the - * second-half of the igc_resume routine. + * second-half of the __igc_resume routine. 
*/ static void igc_io_resume(struct pci_dev *pdev) { diff --git a/drivers/net/ethernet/intel/igc/igc_nvm.c b/drivers/net/ethernet/intel/igc/igc_nvm.c index 58f81aba0144..efd121c03967 100644 --- a/drivers/net/ethernet/intel/igc/igc_nvm.c +++ b/drivers/net/ethernet/intel/igc/igc_nvm.c @@ -36,56 +36,6 @@ static s32 igc_poll_eerd_eewr_done(struct igc_hw *hw, int ee_reg) } /** - * igc_acquire_nvm - Generic request for access to EEPROM - * @hw: pointer to the HW structure - * - * Set the EEPROM access request bit and wait for EEPROM access grant bit. - * Return successful if access grant bit set, else clear the request for - * EEPROM access and return -IGC_ERR_NVM (-1). - */ -s32 igc_acquire_nvm(struct igc_hw *hw) -{ - s32 timeout = IGC_NVM_GRANT_ATTEMPTS; - u32 eecd = rd32(IGC_EECD); - s32 ret_val = 0; - - wr32(IGC_EECD, eecd | IGC_EECD_REQ); - eecd = rd32(IGC_EECD); - - while (timeout) { - if (eecd & IGC_EECD_GNT) - break; - udelay(5); - eecd = rd32(IGC_EECD); - timeout--; - } - - if (!timeout) { - eecd &= ~IGC_EECD_REQ; - wr32(IGC_EECD, eecd); - hw_dbg("Could not acquire NVM grant\n"); - ret_val = -IGC_ERR_NVM; - } - - return ret_val; -} - -/** - * igc_release_nvm - Release exclusive access to EEPROM - * @hw: pointer to the HW structure - * - * Stop any current commands to the EEPROM and clear the EEPROM request bit. 
- */ -void igc_release_nvm(struct igc_hw *hw) -{ - u32 eecd; - - eecd = rd32(IGC_EECD); - eecd &= ~IGC_EECD_REQ; - wr32(IGC_EECD, eecd); -} - -/** * igc_read_nvm_eerd - Reads EEPROM using EERD register * @hw: pointer to the HW structure * @offset: offset of word in the EEPROM to read diff --git a/drivers/net/ethernet/intel/igc/igc_nvm.h b/drivers/net/ethernet/intel/igc/igc_nvm.h index f9fc2e9cfb03..ab78d0c64547 100644 --- a/drivers/net/ethernet/intel/igc/igc_nvm.h +++ b/drivers/net/ethernet/intel/igc/igc_nvm.h @@ -4,8 +4,6 @@ #ifndef _IGC_NVM_H_ #define _IGC_NVM_H_ -s32 igc_acquire_nvm(struct igc_hw *hw); -void igc_release_nvm(struct igc_hw *hw); s32 igc_read_mac_addr(struct igc_hw *hw); s32 igc_read_nvm_eerd(struct igc_hw *hw, u16 offset, u16 words, u16 *data); s32 igc_validate_nvm_checksum(struct igc_hw *hw); diff --git a/drivers/net/ethernet/intel/igc/igc_xdp.c b/drivers/net/ethernet/intel/igc/igc_xdp.c index e27af72aada8..13bbd3346e01 100644 --- a/drivers/net/ethernet/intel/igc/igc_xdp.c +++ b/drivers/net/ethernet/intel/igc/igc_xdp.c @@ -13,6 +13,7 @@ int igc_xdp_set_prog(struct igc_adapter *adapter, struct bpf_prog *prog, struct net_device *dev = adapter->netdev; bool if_running = netif_running(dev); struct bpf_prog *old_prog; + bool need_update; if (dev->mtu > ETH_DATA_LEN) { /* For now, the driver doesn't support XDP functionality with @@ -22,7 +23,8 @@ int igc_xdp_set_prog(struct igc_adapter *adapter, struct bpf_prog *prog, return -EOPNOTSUPP; } - if (if_running) + need_update = !!adapter->xdp_prog != !!prog; + if (if_running && need_update) igc_close(dev); old_prog = xchg(&adapter->xdp_prog, prog); @@ -34,7 +36,7 @@ int igc_xdp_set_prog(struct igc_adapter *adapter, struct bpf_prog *prog, else xdp_features_clear_redirect_target(dev); - if (if_running) + if (if_running && need_update) igc_open(dev); return 0; @@ -84,6 +86,7 @@ static int igc_xdp_enable_pool(struct igc_adapter *adapter, napi_disable(napi); } + igc_set_queue_napi(adapter, queue_id, NULL); 
set_bit(IGC_RING_FLAG_AF_XDP_ZC, &rx_ring->flags); set_bit(IGC_RING_FLAG_AF_XDP_ZC, &tx_ring->flags); @@ -133,6 +136,7 @@ static int igc_xdp_disable_pool(struct igc_adapter *adapter, u16 queue_id) xsk_pool_dma_unmap(pool, IGC_RX_DMA_ATTR); clear_bit(IGC_RING_FLAG_AF_XDP_ZC, &rx_ring->flags); clear_bit(IGC_RING_FLAG_AF_XDP_ZC, &tx_ring->flags); + igc_set_queue_napi(adapter, queue_id, napi); if (needs_reset) { napi_enable(napi); diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 336e08d35f97..7236f20c9a30 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -1923,10 +1923,6 @@ bool ixgbe_cleanup_headers(struct ixgbe_ring *rx_ring, { struct net_device *netdev = rx_ring->netdev; - /* XDP packets use error pointer so abort at this point */ - if (IS_ERR(skb)) - return true; - /* Verify netdev is present, and that packet does not have any * errors that would be unacceptable to the netdev. 
*/ @@ -2234,9 +2230,9 @@ static struct sk_buff *ixgbe_build_skb(struct ixgbe_ring *rx_ring, return skb; } -static struct sk_buff *ixgbe_run_xdp(struct ixgbe_adapter *adapter, - struct ixgbe_ring *rx_ring, - struct xdp_buff *xdp) +static int ixgbe_run_xdp(struct ixgbe_adapter *adapter, + struct ixgbe_ring *rx_ring, + struct xdp_buff *xdp) { int err, result = IXGBE_XDP_PASS; struct bpf_prog *xdp_prog; @@ -2286,7 +2282,7 @@ out_failure: break; } xdp_out: - return ERR_PTR(-result); + return result; } static unsigned int ixgbe_rx_frame_truesize(struct ixgbe_ring *rx_ring, @@ -2344,6 +2340,7 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector, unsigned int offset = rx_ring->rx_offset; unsigned int xdp_xmit = 0; struct xdp_buff xdp; + int xdp_res = 0; /* Frame size depend on rx_ring setup when PAGE_SIZE=4K */ #if (PAGE_SIZE < 8192) @@ -2389,12 +2386,10 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector, /* At larger PAGE_SIZE, frame_sz depend on len size */ xdp.frame_sz = ixgbe_rx_frame_truesize(rx_ring, size); #endif - skb = ixgbe_run_xdp(adapter, rx_ring, &xdp); + xdp_res = ixgbe_run_xdp(adapter, rx_ring, &xdp); } - if (IS_ERR(skb)) { - unsigned int xdp_res = -PTR_ERR(skb); - + if (xdp_res) { if (xdp_res & (IXGBE_XDP_TX | IXGBE_XDP_REDIR)) { xdp_xmit |= xdp_res; ixgbe_rx_buffer_flip(rx_ring, rx_buffer, size); @@ -2414,7 +2409,7 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector, } /* exit if we failed to retrieve a buffer */ - if (!skb) { + if (!xdp_res && !skb) { rx_ring->rx_stats.alloc_rx_buff_failed++; rx_buffer->pagecnt_bias++; break; @@ -2428,7 +2423,7 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector, continue; /* verify the packet layout is correct */ - if (ixgbe_cleanup_headers(rx_ring, rx_desc, skb)) + if (xdp_res || ixgbe_cleanup_headers(rx_ring, rx_desc, skb)) continue; /* probably a little skewed due to removing CRC */ diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h 
b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h index 9b37f354d78c..4384e892f967 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h @@ -443,7 +443,6 @@ extern const struct ixgbevf_info ixgbevf_X540_vf_hv_info; extern const struct ixgbevf_info ixgbevf_X550_vf_hv_info; extern const struct ixgbevf_info ixgbevf_X550EM_x_vf_hv_info; extern const struct ixgbevf_info ixgbevf_e610_vf_hv_info; -extern const struct ixgbe_mbx_operations ixgbevf_hv_mbx_ops; /* needed by ethtool.c */ extern const char ixgbevf_driver_name[]; diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c index 2829bac9af94..6442f115a262 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c @@ -737,10 +737,6 @@ static bool ixgbevf_cleanup_headers(struct ixgbevf_ring *rx_ring, union ixgbe_adv_rx_desc *rx_desc, struct sk_buff *skb) { - /* XDP packets use error pointer so abort at this point */ - if (IS_ERR(skb)) - return true; - /* verify that the packet does not have any known errors */ if (unlikely(ixgbevf_test_staterr(rx_desc, IXGBE_RXDADV_ERR_FRAME_ERR_MASK))) { @@ -1049,9 +1045,9 @@ static int ixgbevf_xmit_xdp_ring(struct ixgbevf_ring *ring, return IXGBEVF_XDP_TX; } -static struct sk_buff *ixgbevf_run_xdp(struct ixgbevf_adapter *adapter, - struct ixgbevf_ring *rx_ring, - struct xdp_buff *xdp) +static int ixgbevf_run_xdp(struct ixgbevf_adapter *adapter, + struct ixgbevf_ring *rx_ring, + struct xdp_buff *xdp) { int result = IXGBEVF_XDP_PASS; struct ixgbevf_ring *xdp_ring; @@ -1085,7 +1081,7 @@ out_failure: break; } xdp_out: - return ERR_PTR(-result); + return result; } static unsigned int ixgbevf_rx_frame_truesize(struct ixgbevf_ring *rx_ring, @@ -1127,6 +1123,7 @@ static int ixgbevf_clean_rx_irq(struct ixgbevf_q_vector *q_vector, struct sk_buff *skb = rx_ring->skb; bool xdp_xmit = false; struct xdp_buff xdp; + int xdp_res = 
0; /* Frame size depend on rx_ring setup when PAGE_SIZE=4K */ #if (PAGE_SIZE < 8192) @@ -1170,11 +1167,11 @@ static int ixgbevf_clean_rx_irq(struct ixgbevf_q_vector *q_vector, /* At larger PAGE_SIZE, frame_sz depend on len size */ xdp.frame_sz = ixgbevf_rx_frame_truesize(rx_ring, size); #endif - skb = ixgbevf_run_xdp(adapter, rx_ring, &xdp); + xdp_res = ixgbevf_run_xdp(adapter, rx_ring, &xdp); } - if (IS_ERR(skb)) { - if (PTR_ERR(skb) == -IXGBEVF_XDP_TX) { + if (xdp_res) { + if (xdp_res == IXGBEVF_XDP_TX) { xdp_xmit = true; ixgbevf_rx_buffer_flip(rx_ring, rx_buffer, size); @@ -1194,7 +1191,7 @@ static int ixgbevf_clean_rx_irq(struct ixgbevf_q_vector *q_vector, } /* exit if we failed to retrieve a buffer */ - if (!skb) { + if (!xdp_res && !skb) { rx_ring->rx_stats.alloc_rx_buff_failed++; rx_buffer->pagecnt_bias++; break; @@ -1208,7 +1205,7 @@ static int ixgbevf_clean_rx_irq(struct ixgbevf_q_vector *q_vector, continue; /* verify the packet layout is correct */ - if (ixgbevf_cleanup_headers(rx_ring, rx_desc, skb)) { + if (xdp_res || ixgbevf_cleanup_headers(rx_ring, rx_desc, skb)) { skb = NULL; continue; } diff --git a/drivers/net/ethernet/intel/ixgbevf/mbx.c b/drivers/net/ethernet/intel/ixgbevf/mbx.c index a55dd978f7ca..24d0237e7a99 100644 --- a/drivers/net/ethernet/intel/ixgbevf/mbx.c +++ b/drivers/net/ethernet/intel/ixgbevf/mbx.c @@ -505,15 +505,3 @@ const struct ixgbe_mbx_operations ixgbevf_mbx_ops_legacy = { .check_for_ack = ixgbevf_check_for_ack_vf, .check_for_rst = ixgbevf_check_for_rst_vf, }; - -/* Mailbox operations when running on Hyper-V. - * On Hyper-V, PF/VF communication is not through the - * hardware mailbox; this communication is through - * a software mediated path. - * Most mail box operations are noop while running on - * Hyper-V. 
- */ -const struct ixgbe_mbx_operations ixgbevf_hv_mbx_ops = { - .init_params = ixgbevf_init_mbx_params_vf, - .check_for_rst = ixgbevf_check_for_rst_vf, -}; diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_macsec.c b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_macsec.c index 6cc7a78968fc..f3b9daffaec3 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_macsec.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_macsec.c @@ -133,9 +133,7 @@ static const char *rsrc_name(enum mcs_rsrc_type rsrc_type) return "SA"; default: return "Unknown"; - }; - - return "Unknown"; + } } static int cn10k_mcs_alloc_rsrc(struct otx2_nic *pfvf, enum mcs_direction dir, diff --git a/drivers/net/ethernet/mediatek/airoha_eth.c b/drivers/net/ethernet/mediatek/airoha_eth.c index d8bfc21a5b19..a30c417d66f2 100644 --- a/drivers/net/ethernet/mediatek/airoha_eth.c +++ b/drivers/net/ethernet/mediatek/airoha_eth.c @@ -15,6 +15,7 @@ #include <linux/u64_stats_sync.h> #include <net/dsa.h> #include <net/page_pool/helpers.h> +#include <net/pkt_cls.h> #include <uapi/linux/ppp_defs.h> #define AIROHA_MAX_NUM_GDM_PORTS 1 @@ -23,8 +24,12 @@ #define AIROHA_MAX_NUM_XSI_RSTS 5 #define AIROHA_MAX_MTU 2000 #define AIROHA_MAX_PACKET_SIZE 2048 +#define AIROHA_NUM_QOS_CHANNELS 4 +#define AIROHA_NUM_QOS_QUEUES 8 #define AIROHA_NUM_TX_RING 32 #define AIROHA_NUM_RX_RING 32 +#define AIROHA_NUM_NETDEV_TX_RINGS (AIROHA_NUM_TX_RING + \ + AIROHA_NUM_QOS_CHANNELS) #define AIROHA_FE_MC_MAX_VLAN_TABLE 64 #define AIROHA_FE_MC_MAX_VLAN_PORT 16 #define AIROHA_NUM_TX_IRQ 2 @@ -40,6 +45,9 @@ #define PSE_RSV_PAGES 128 #define PSE_QUEUE_RSV_PAGES 64 +#define QDMA_METER_IDX(_n) ((_n) & 0xff) +#define QDMA_METER_GROUP(_n) (((_n) >> 8) & 0x3) + /* FE */ #define PSE_BASE 0x0100 #define CSR_IFC_BASE 0x0200 @@ -541,9 +549,24 @@ #define INGRESS_SLOW_TICK_RATIO_MASK GENMASK(29, 16) #define INGRESS_FAST_TICK_MASK GENMASK(15, 0) +#define REG_QUEUE_CLOSE_CFG(_n) (0x00a0 + ((_n) & 0xfc)) +#define 
TXQ_DISABLE_CHAN_QUEUE_MASK(_n, _m) BIT((_m) + (((_n) & 0x3) << 3)) + #define REG_TXQ_DIS_CFG_BASE(_n) ((_n) ? 0x20a0 : 0x00a0) #define REG_TXQ_DIS_CFG(_n, _m) (REG_TXQ_DIS_CFG_BASE((_n)) + (_m) << 2) +#define REG_CNTR_CFG(_n) (0x0400 + ((_n) << 3)) +#define CNTR_EN_MASK BIT(31) +#define CNTR_ALL_CHAN_EN_MASK BIT(30) +#define CNTR_ALL_QUEUE_EN_MASK BIT(29) +#define CNTR_ALL_DSCP_RING_EN_MASK BIT(28) +#define CNTR_SRC_MASK GENMASK(27, 24) +#define CNTR_DSCP_RING_MASK GENMASK(20, 16) +#define CNTR_CHAN_MASK GENMASK(7, 3) +#define CNTR_QUEUE_MASK GENMASK(2, 0) + +#define REG_CNTR_VAL(_n) (0x0404 + ((_n) << 3)) + #define REG_LMGR_INIT_CFG 0x1000 #define LMGR_INIT_START BIT(31) #define LMGR_SRAM_MODE_MASK BIT(30) @@ -565,13 +588,34 @@ #define EGRESS_SLOW_TICK_RATIO_MASK GENMASK(29, 16) #define EGRESS_FAST_TICK_MASK GENMASK(15, 0) +#define TRTCM_PARAM_RW_MASK BIT(31) +#define TRTCM_PARAM_RW_DONE_MASK BIT(30) +#define TRTCM_PARAM_TYPE_MASK GENMASK(29, 28) +#define TRTCM_METER_GROUP_MASK GENMASK(27, 26) +#define TRTCM_PARAM_INDEX_MASK GENMASK(23, 17) +#define TRTCM_PARAM_RATE_TYPE_MASK BIT(16) + +#define REG_TRTCM_CFG_PARAM(_n) ((_n) + 0x4) +#define REG_TRTCM_DATA_LOW(_n) ((_n) + 0x8) +#define REG_TRTCM_DATA_HIGH(_n) ((_n) + 0xc) + #define REG_TXWRR_MODE_CFG 0x1020 #define TWRR_WEIGHT_SCALE_MASK BIT(31) #define TWRR_WEIGHT_BASE_MASK BIT(3) +#define REG_TXWRR_WEIGHT_CFG 0x1024 +#define TWRR_RW_CMD_MASK BIT(31) +#define TWRR_RW_CMD_DONE BIT(30) +#define TWRR_CHAN_IDX_MASK GENMASK(23, 19) +#define TWRR_QUEUE_IDX_MASK GENMASK(18, 16) +#define TWRR_VALUE_MASK GENMASK(15, 0) + #define REG_PSE_BUF_USAGE_CFG 0x1028 #define PSE_BUF_ESTIMATE_EN_MASK BIT(29) +#define REG_CHAN_QOS_MODE(_n) (0x1040 + ((_n) << 2)) +#define CHAN_QOS_MODE_MASK(_n) GENMASK(2 + ((_n) << 2), (_n) << 2) + #define REG_GLB_TRTCM_CFG 0x1080 #define GLB_TRTCM_EN_MASK BIT(31) #define GLB_TRTCM_MODE_MASK BIT(30) @@ -720,6 +764,40 @@ enum { FE_PSE_PORT_DROP = 0xf, }; +enum tx_sched_mode { + TC_SCH_WRR8, + TC_SCH_SP, 
+ TC_SCH_WRR7, + TC_SCH_WRR6, + TC_SCH_WRR5, + TC_SCH_WRR4, + TC_SCH_WRR3, + TC_SCH_WRR2, +}; + +enum trtcm_param_type { + TRTCM_MISC_MODE, /* meter_en, pps_mode, tick_sel */ + TRTCM_TOKEN_RATE_MODE, + TRTCM_BUCKETSIZE_SHIFT_MODE, + TRTCM_BUCKET_COUNTER_MODE, +}; + +enum trtcm_mode_type { + TRTCM_COMMIT_MODE, + TRTCM_PEAK_MODE, +}; + +enum trtcm_param { + TRTCM_TICK_SEL = BIT(0), + TRTCM_PKT_MODE = BIT(1), + TRTCM_METER_MODE = BIT(2), +}; + +#define MIN_TOKEN_SIZE 4096 +#define MAX_TOKEN_SIZE_OFFSET 17 +#define TRTCM_TOKEN_RATE_MASK GENMASK(23, 6) +#define TRTCM_TOKEN_RATE_FRACTION_MASK GENMASK(5, 0) + struct airoha_queue_entry { union { void *buf; @@ -810,6 +888,12 @@ struct airoha_gdm_port { int id; struct airoha_hw_stats stats; + + DECLARE_BITMAP(qos_sq_bmap, AIROHA_NUM_QOS_CHANNELS); + + /* qos stats counters */ + u64 cpu_tx_packets; + u64 fwd_tx_packets; }; struct airoha_eth { @@ -1789,6 +1873,10 @@ static int airoha_qdma_init_tx_queue(struct airoha_queue *q, WRITE_ONCE(q->desc[i].ctrl, cpu_to_le32(val)); } + /* xmit ring drop default setting */ + airoha_qdma_set(qdma, REG_TX_RING_BLOCKING(qid), + TX_RING_IRQ_BLOCKING_TX_DROP_EN_MASK); + airoha_qdma_wr(qdma, REG_TX_RING_BASE(qid), dma_addr); airoha_qdma_rmw(qdma, REG_TX_CPU_IDX(qid), TX_RING_CPU_IDX_MASK, FIELD_PREP(TX_RING_CPU_IDX_MASK, q->head)); @@ -1955,6 +2043,27 @@ static void airoha_qdma_init_qos(struct airoha_qdma *qdma) FIELD_PREP(SLA_SLOW_TICK_RATIO_MASK, 40)); } +static void airoha_qdma_init_qos_stats(struct airoha_qdma *qdma) +{ + int i; + + for (i = 0; i < AIROHA_NUM_QOS_CHANNELS; i++) { + /* Tx-cpu transferred count (even counter: i << 1) */ + airoha_qdma_wr(qdma, REG_CNTR_VAL(i << 1), 0); + airoha_qdma_wr(qdma, REG_CNTR_CFG(i << 1), + CNTR_EN_MASK | CNTR_ALL_QUEUE_EN_MASK | + CNTR_ALL_DSCP_RING_EN_MASK | + FIELD_PREP(CNTR_CHAN_MASK, i)); + /* Tx-fwd transferred count (odd counter: VAL and CFG) */ + airoha_qdma_wr(qdma, REG_CNTR_VAL((i << 1) + 1), 0); + airoha_qdma_wr(qdma, REG_CNTR_CFG((i << 1) + 1), + CNTR_EN_MASK | CNTR_ALL_QUEUE_EN_MASK | + 
CNTR_ALL_DSCP_RING_EN_MASK | + FIELD_PREP(CNTR_SRC_MASK, 1) | + FIELD_PREP(CNTR_CHAN_MASK, i)); + } +} + static int airoha_qdma_hw_init(struct airoha_qdma *qdma) { int i; @@ -2005,6 +2114,7 @@ static int airoha_qdma_hw_init(struct airoha_qdma *qdma) airoha_qdma_set(qdma, REG_TXQ_CNGST_CFG, TXQ_CNGST_DROP_EN | TXQ_CNGST_DEI_DROP_EN); + airoha_qdma_init_qos_stats(qdma); return 0; } @@ -2425,21 +2535,44 @@ static void airoha_dev_get_stats64(struct net_device *dev, } while (u64_stats_fetch_retry(&port->stats.syncp, start)); } +static u16 airoha_dev_select_queue(struct net_device *dev, struct sk_buff *skb, + struct net_device *sb_dev) +{ + struct airoha_gdm_port *port = netdev_priv(dev); + int queue, channel; + + /* For dsa device select QoS channel according to the dsa user port + * index, rely on port id otherwise. Select QoS queue based on the + * skb priority. + */ + channel = netdev_uses_dsa(dev) ? skb_get_queue_mapping(skb) : port->id; + channel = channel % AIROHA_NUM_QOS_CHANNELS; + queue = (skb->priority - 1) % AIROHA_NUM_QOS_QUEUES; /* QoS queue */ + queue = channel * AIROHA_NUM_QOS_QUEUES + queue; + + return queue < dev->num_tx_queues ? 
queue : 0; +} + static netdev_tx_t airoha_dev_xmit(struct sk_buff *skb, struct net_device *dev) { struct skb_shared_info *sinfo = skb_shinfo(skb); struct airoha_gdm_port *port = netdev_priv(dev); - u32 msg0 = 0, msg1, len = skb_headlen(skb); - int i, qid = skb_get_queue_mapping(skb); + u32 msg0, msg1, len = skb_headlen(skb); struct airoha_qdma *qdma = port->qdma; u32 nr_frags = 1 + sinfo->nr_frags; struct netdev_queue *txq; struct airoha_queue *q; void *data = skb->data; + int i, qid; u16 index; u8 fport; + qid = skb_get_queue_mapping(skb) % ARRAY_SIZE(qdma->q_tx); + msg0 = FIELD_PREP(QDMA_ETH_TXMSG_CHAN_MASK, + qid / AIROHA_NUM_QOS_QUEUES) | + FIELD_PREP(QDMA_ETH_TXMSG_QUEUE_MASK, + qid % AIROHA_NUM_QOS_QUEUES); if (skb->ip_summed == CHECKSUM_PARTIAL) msg0 |= FIELD_PREP(QDMA_ETH_TXMSG_TCO_MASK, 1) | FIELD_PREP(QDMA_ETH_TXMSG_UCO_MASK, 1) | @@ -2609,13 +2742,386 @@ airoha_ethtool_get_rmon_stats(struct net_device *dev, } while (u64_stats_fetch_retry(&port->stats.syncp, start)); } +static int airoha_qdma_set_chan_tx_sched(struct airoha_gdm_port *port, + int channel, enum tx_sched_mode mode, + const u16 *weights, u8 n_weights) +{ + int i; + + for (i = 0; i < AIROHA_NUM_TX_RING; i++) + airoha_qdma_clear(port->qdma, REG_QUEUE_CLOSE_CFG(channel), + TXQ_DISABLE_CHAN_QUEUE_MASK(channel, i)); + + for (i = 0; i < n_weights; i++) { + u32 status; + int err; + + airoha_qdma_wr(port->qdma, REG_TXWRR_WEIGHT_CFG, + TWRR_RW_CMD_MASK | + FIELD_PREP(TWRR_CHAN_IDX_MASK, channel) | + FIELD_PREP(TWRR_QUEUE_IDX_MASK, i) | + FIELD_PREP(TWRR_VALUE_MASK, weights[i])); + err = read_poll_timeout(airoha_qdma_rr, status, + status & TWRR_RW_CMD_DONE, + USEC_PER_MSEC, 10 * USEC_PER_MSEC, + true, port->qdma, + REG_TXWRR_WEIGHT_CFG); + if (err) + return err; + } + + airoha_qdma_rmw(port->qdma, REG_CHAN_QOS_MODE(channel >> 3), + CHAN_QOS_MODE_MASK(channel), + mode << __ffs(CHAN_QOS_MODE_MASK(channel))); + + return 0; +} + +static int airoha_qdma_set_tx_prio_sched(struct airoha_gdm_port *port, + int 
channel) +{ + static const u16 w[AIROHA_NUM_QOS_QUEUES] = {}; + + return airoha_qdma_set_chan_tx_sched(port, channel, TC_SCH_SP, w, + ARRAY_SIZE(w)); +} + +static int airoha_qdma_set_tx_ets_sched(struct airoha_gdm_port *port, + int channel, + struct tc_ets_qopt_offload *opt) +{ + struct tc_ets_qopt_offload_replace_params *p = &opt->replace_params; + enum tx_sched_mode mode = TC_SCH_SP; + u16 w[AIROHA_NUM_QOS_QUEUES] = {}; + int i, nstrict = 0; + + if (p->bands > AIROHA_NUM_QOS_QUEUES) + return -EINVAL; + + for (i = 0; i < p->bands; i++) { + if (!p->quanta[i]) + nstrict++; + } + + /* this configuration is not supported by the hw */ + if (nstrict == AIROHA_NUM_QOS_QUEUES - 1) + return -EINVAL; + + for (i = 0; i < p->bands - nstrict; i++) + w[i] = p->weights[nstrict + i]; + + if (!nstrict) + mode = TC_SCH_WRR8; + else if (nstrict < AIROHA_NUM_QOS_QUEUES - 1) + mode = nstrict + 1; + + return airoha_qdma_set_chan_tx_sched(port, channel, mode, w, + ARRAY_SIZE(w)); +} + +static int airoha_qdma_get_tx_ets_stats(struct airoha_gdm_port *port, + int channel, + struct tc_ets_qopt_offload *opt) +{ + u64 cpu_tx_packets = airoha_qdma_rr(port->qdma, + REG_CNTR_VAL(channel << 1)); + u64 fwd_tx_packets = airoha_qdma_rr(port->qdma, + REG_CNTR_VAL((channel << 1) + 1)); + u64 tx_packets = (cpu_tx_packets - port->cpu_tx_packets) + + (fwd_tx_packets - port->fwd_tx_packets); + _bstats_update(opt->stats.bstats, 0, tx_packets); + + port->cpu_tx_packets = cpu_tx_packets; + port->fwd_tx_packets = fwd_tx_packets; + + return 0; +} + +static int airoha_tc_setup_qdisc_ets(struct airoha_gdm_port *port, + struct tc_ets_qopt_offload *opt) +{ + int channel; + + if (opt->parent == TC_H_ROOT) + return -EINVAL; + + channel = TC_H_MAJ(opt->handle) >> 16; + channel = channel % AIROHA_NUM_QOS_CHANNELS; + + switch (opt->command) { + case TC_ETS_REPLACE: + return airoha_qdma_set_tx_ets_sched(port, channel, opt); + case TC_ETS_DESTROY: + /* PRIO is default qdisc scheduler */ + return 
airoha_qdma_set_tx_prio_sched(port, channel); + case TC_ETS_STATS: + return airoha_qdma_get_tx_ets_stats(port, channel, opt); + default: + return -EOPNOTSUPP; + } +} + +static int airoha_qdma_get_trtcm_param(struct airoha_qdma *qdma, int channel, + u32 addr, enum trtcm_param_type param, + enum trtcm_mode_type mode, + u32 *val_low, u32 *val_high) +{ + u32 idx = QDMA_METER_IDX(channel), group = QDMA_METER_GROUP(channel); + u32 val, config = FIELD_PREP(TRTCM_PARAM_TYPE_MASK, param) | + FIELD_PREP(TRTCM_METER_GROUP_MASK, group) | + FIELD_PREP(TRTCM_PARAM_INDEX_MASK, idx) | + FIELD_PREP(TRTCM_PARAM_RATE_TYPE_MASK, mode); + + airoha_qdma_wr(qdma, REG_TRTCM_CFG_PARAM(addr), config); + if (read_poll_timeout(airoha_qdma_rr, val, + val & TRTCM_PARAM_RW_DONE_MASK, + USEC_PER_MSEC, 10 * USEC_PER_MSEC, true, + qdma, REG_TRTCM_CFG_PARAM(addr))) + return -ETIMEDOUT; + + *val_low = airoha_qdma_rr(qdma, REG_TRTCM_DATA_LOW(addr)); + if (val_high) + *val_high = airoha_qdma_rr(qdma, REG_TRTCM_DATA_HIGH(addr)); + + return 0; +} + +static int airoha_qdma_set_trtcm_param(struct airoha_qdma *qdma, int channel, + u32 addr, enum trtcm_param_type param, + enum trtcm_mode_type mode, u32 val) +{ + u32 idx = QDMA_METER_IDX(channel), group = QDMA_METER_GROUP(channel); + u32 config = TRTCM_PARAM_RW_MASK | + FIELD_PREP(TRTCM_PARAM_TYPE_MASK, param) | + FIELD_PREP(TRTCM_METER_GROUP_MASK, group) | + FIELD_PREP(TRTCM_PARAM_INDEX_MASK, idx) | + FIELD_PREP(TRTCM_PARAM_RATE_TYPE_MASK, mode); + + airoha_qdma_wr(qdma, REG_TRTCM_DATA_LOW(addr), val); + airoha_qdma_wr(qdma, REG_TRTCM_CFG_PARAM(addr), config); + + return read_poll_timeout(airoha_qdma_rr, val, + val & TRTCM_PARAM_RW_DONE_MASK, + USEC_PER_MSEC, 10 * USEC_PER_MSEC, true, + qdma, REG_TRTCM_CFG_PARAM(addr)); +} + +static int airoha_qdma_set_trtcm_config(struct airoha_qdma *qdma, int channel, + u32 addr, enum trtcm_mode_type mode, + bool enable, u32 enable_mask) +{ + u32 val; + + if (airoha_qdma_get_trtcm_param(qdma, channel, addr, 
TRTCM_MISC_MODE, + mode, &val, NULL)) + return -EINVAL; + + val = enable ? val | enable_mask : val & ~enable_mask; + + return airoha_qdma_set_trtcm_param(qdma, channel, addr, TRTCM_MISC_MODE, + mode, val); +} + +static int airoha_qdma_set_trtcm_token_bucket(struct airoha_qdma *qdma, + int channel, u32 addr, + enum trtcm_mode_type mode, + u32 rate_val, u32 bucket_size) +{ + u32 val, config, tick, unit, rate, rate_frac; + int err; + + if (airoha_qdma_get_trtcm_param(qdma, channel, addr, TRTCM_MISC_MODE, + mode, &config, NULL)) + return -EINVAL; + + val = airoha_qdma_rr(qdma, addr); + tick = FIELD_GET(INGRESS_FAST_TICK_MASK, val); + if (config & TRTCM_TICK_SEL) + tick *= FIELD_GET(INGRESS_SLOW_TICK_RATIO_MASK, val); + if (!tick) + return -EINVAL; + + unit = (config & TRTCM_PKT_MODE) ? 1000000 / tick : 8000 / tick; + if (!unit) + return -EINVAL; + + rate = rate_val / unit; + rate_frac = rate_val % unit; + rate_frac = FIELD_PREP(TRTCM_TOKEN_RATE_MASK, rate_frac) / unit; + rate = FIELD_PREP(TRTCM_TOKEN_RATE_MASK, rate) | + FIELD_PREP(TRTCM_TOKEN_RATE_FRACTION_MASK, rate_frac); + + err = airoha_qdma_set_trtcm_param(qdma, channel, addr, + TRTCM_TOKEN_RATE_MODE, mode, rate); + if (err) + return err; + + val = max_t(u32, bucket_size, MIN_TOKEN_SIZE); + val = min_t(u32, __fls(val), MAX_TOKEN_SIZE_OFFSET); + + return airoha_qdma_set_trtcm_param(qdma, channel, addr, + TRTCM_BUCKETSIZE_SHIFT_MODE, + mode, val); +} + +static int airoha_qdma_set_tx_rate_limit(struct airoha_gdm_port *port, + int channel, u32 rate, + u32 bucket_size) +{ + int i, err; + + for (i = 0; i <= TRTCM_PEAK_MODE; i++) { + err = airoha_qdma_set_trtcm_config(port->qdma, channel, + REG_EGRESS_TRTCM_CFG, i, + !!rate, TRTCM_METER_MODE); + if (err) + return err; + + err = airoha_qdma_set_trtcm_token_bucket(port->qdma, channel, + REG_EGRESS_TRTCM_CFG, + i, rate, bucket_size); + if (err) + return err; + } + + return 0; +} + +static int airoha_tc_htb_alloc_leaf_queue(struct airoha_gdm_port *port, + struct 
tc_htb_qopt_offload *opt) +{ + u32 channel = TC_H_MIN(opt->classid) % AIROHA_NUM_QOS_CHANNELS; + u32 rate = div_u64(opt->rate, 1000) << 3; /* kbps */ + struct net_device *dev = port->dev; + int num_tx_queues = dev->real_num_tx_queues; + int err; + + if (opt->parent_classid != TC_HTB_CLASSID_ROOT) { + NL_SET_ERR_MSG_MOD(opt->extack, "invalid parent classid"); + return -EINVAL; + } + + err = airoha_qdma_set_tx_rate_limit(port, channel, rate, opt->quantum); + if (err) { + NL_SET_ERR_MSG_MOD(opt->extack, + "failed configuring htb offload"); + return err; + } + + if (opt->command == TC_HTB_NODE_MODIFY) + return 0; + + err = netif_set_real_num_tx_queues(dev, num_tx_queues + 1); + if (err) { + airoha_qdma_set_tx_rate_limit(port, channel, 0, opt->quantum); + NL_SET_ERR_MSG_MOD(opt->extack, + "failed setting real_num_tx_queues"); + return err; + } + + set_bit(channel, port->qos_sq_bmap); + opt->qid = AIROHA_NUM_TX_RING + channel; + + return 0; +} + +static void airoha_tc_remove_htb_queue(struct airoha_gdm_port *port, int queue) +{ + struct net_device *dev = port->dev; + + netif_set_real_num_tx_queues(dev, dev->real_num_tx_queues - 1); + airoha_qdma_set_tx_rate_limit(port, queue + 1, 0, 0); + clear_bit(queue, port->qos_sq_bmap); +} + +static int airoha_tc_htb_delete_leaf_queue(struct airoha_gdm_port *port, + struct tc_htb_qopt_offload *opt) +{ + u32 channel = TC_H_MIN(opt->classid) % AIROHA_NUM_QOS_CHANNELS; + + if (!test_bit(channel, port->qos_sq_bmap)) { + NL_SET_ERR_MSG_MOD(opt->extack, "invalid queue id"); + return -EINVAL; + } + + airoha_tc_remove_htb_queue(port, channel); + + return 0; +} + +static int airoha_tc_htb_destroy(struct airoha_gdm_port *port) +{ + int q; + + for_each_set_bit(q, port->qos_sq_bmap, AIROHA_NUM_QOS_CHANNELS) + airoha_tc_remove_htb_queue(port, q); + + return 0; +} + +static int airoha_tc_get_htb_get_leaf_queue(struct airoha_gdm_port *port, + struct tc_htb_qopt_offload *opt) +{ + u32 channel = TC_H_MIN(opt->classid) % AIROHA_NUM_QOS_CHANNELS; + + 
if (!test_bit(channel, port->qos_sq_bmap)) { + NL_SET_ERR_MSG_MOD(opt->extack, "invalid queue id"); + return -EINVAL; + } + + opt->qid = channel; + + return 0; +} + +static int airoha_tc_setup_qdisc_htb(struct airoha_gdm_port *port, + struct tc_htb_qopt_offload *opt) +{ + switch (opt->command) { + case TC_HTB_CREATE: + break; + case TC_HTB_DESTROY: + return airoha_tc_htb_destroy(port); + case TC_HTB_NODE_MODIFY: + case TC_HTB_LEAF_ALLOC_QUEUE: + return airoha_tc_htb_alloc_leaf_queue(port, opt); + case TC_HTB_LEAF_DEL: + case TC_HTB_LEAF_DEL_LAST: + case TC_HTB_LEAF_DEL_LAST_FORCE: + return airoha_tc_htb_delete_leaf_queue(port, opt); + case TC_HTB_LEAF_QUERY_QUEUE: + return airoha_tc_get_htb_get_leaf_queue(port, opt); + default: + return -EOPNOTSUPP; + } + + return 0; +} + +static int airoha_dev_tc_setup(struct net_device *dev, enum tc_setup_type type, + void *type_data) +{ + struct airoha_gdm_port *port = netdev_priv(dev); + + switch (type) { + case TC_SETUP_QDISC_ETS: + return airoha_tc_setup_qdisc_ets(port, type_data); + case TC_SETUP_QDISC_HTB: + return airoha_tc_setup_qdisc_htb(port, type_data); + default: + return -EOPNOTSUPP; + } +} + static const struct net_device_ops airoha_netdev_ops = { .ndo_init = airoha_dev_init, .ndo_open = airoha_dev_open, .ndo_stop = airoha_dev_stop, + .ndo_select_queue = airoha_dev_select_queue, .ndo_start_xmit = airoha_dev_xmit, .ndo_get_stats64 = airoha_dev_get_stats64, .ndo_set_mac_address = airoha_dev_set_macaddr, + .ndo_setup_tc = airoha_dev_tc_setup, }; static const struct ethtool_ops airoha_ethtool_ops = { @@ -2652,7 +3158,8 @@ static int airoha_alloc_gdm_port(struct airoha_eth *eth, struct device_node *np) } dev = devm_alloc_etherdev_mqs(eth->dev, sizeof(*port), - AIROHA_NUM_TX_RING, AIROHA_NUM_RX_RING); + AIROHA_NUM_NETDEV_TX_RINGS, + AIROHA_NUM_RX_RING); if (!dev) { dev_err(eth->dev, "alloc_etherdev failed\n"); return -ENOMEM; @@ -2665,12 +3172,18 @@ static int airoha_alloc_gdm_port(struct airoha_eth *eth, struct 
device_node *np) dev->watchdog_timeo = 5 * HZ; dev->hw_features = NETIF_F_IP_CSUM | NETIF_F_RXCSUM | NETIF_F_TSO6 | NETIF_F_IPV6_CSUM | - NETIF_F_SG | NETIF_F_TSO; + NETIF_F_SG | NETIF_F_TSO | + NETIF_F_HW_TC; dev->features |= dev->hw_features; dev->dev.of_node = np; dev->irq = qdma->irq; SET_NETDEV_DEV(dev, eth->dev); + /* reserve hw queues for HTB offloading */ + err = netif_set_real_num_tx_queues(dev, AIROHA_NUM_TX_RING); + if (err) + return err; + err = of_get_ethdev_address(np, dev); if (err) { if (err == -EPROBE_DEFER) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile index 10a763e668ed..d9a8817bb33c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile +++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile @@ -151,8 +151,9 @@ mlx5_core-$(CONFIG_MLX5_HW_STEERING) += steering/hws/cmd.o \ steering/hws/bwc.o \ steering/hws/debug.o \ steering/hws/vport.o \ - steering/hws/bwc_complex.o - + steering/hws/bwc_complex.o \ + steering/hws/fs_hws_pools.o \ + steering/hws/fs_hws.o # # SF device diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c index 6bd8a18e3af3..e733b81e18a2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -1013,6 +1013,7 @@ static void cmd_work_handler(struct work_struct *work) complete(&ent->done); } up(&cmd->vars.sem); + complete(&ent->slotted); return; } } else { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c index ca92e518be76..8489b0a0e8bd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c @@ -94,25 +94,14 @@ static bool mlx5e_ipsec_update_esn_state(struct mlx5e_ipsec_sa_entry *sa_entry) u32 esn, esn_msb; u8 overlap; - switch (x->xso.type) { - case XFRM_DEV_OFFLOAD_PACKET: - switch (x->xso.dir) { - 
case XFRM_DEV_OFFLOAD_IN: - esn = x->replay_esn->seq; - esn_msb = x->replay_esn->seq_hi; - break; - case XFRM_DEV_OFFLOAD_OUT: - esn = x->replay_esn->oseq; - esn_msb = x->replay_esn->oseq_hi; - break; - default: - WARN_ON(true); - return false; - } - break; - case XFRM_DEV_OFFLOAD_CRYPTO: - /* Already parsed by XFRM core */ + switch (x->xso.dir) { + case XFRM_DEV_OFFLOAD_IN: esn = x->replay_esn->seq; + esn_msb = x->replay_esn->seq_hi; + break; + case XFRM_DEV_OFFLOAD_OUT: + esn = x->replay_esn->oseq; + esn_msb = x->replay_esn->oseq_hi; break; default: WARN_ON(true); @@ -121,11 +110,15 @@ static bool mlx5e_ipsec_update_esn_state(struct mlx5e_ipsec_sa_entry *sa_entry) overlap = sa_entry->esn_state.overlap; - if (esn >= x->replay_esn->replay_window) - seq_bottom = esn - x->replay_esn->replay_window + 1; + if (!x->replay_esn->replay_window) { + seq_bottom = esn; + } else { + if (esn >= x->replay_esn->replay_window) + seq_bottom = esn - x->replay_esn->replay_window + 1; - if (x->xso.type == XFRM_DEV_OFFLOAD_CRYPTO) - esn_msb = xfrm_replay_seqhi(x, htonl(seq_bottom)); + if (x->xso.type == XFRM_DEV_OFFLOAD_CRYPTO) + esn_msb = xfrm_replay_seqhi(x, htonl(seq_bottom)); + } if (sa_entry->esn_state.esn_msb) sa_entry->esn_state.esn = esn; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index 41b5e98a0495..f43fd96a680d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -3535,35 +3535,42 @@ static int mlx5_fs_mode_validate(struct devlink *devlink, u32 id, { struct mlx5_core_dev *dev = devlink_priv(devlink); char *value = val.vstr; - int err = 0; + u8 eswitch_mode; - if (!strcmp(value, "dmfs")) { + if (!strcmp(value, "dmfs")) return 0; - } else if (!strcmp(value, "smfs")) { - u8 eswitch_mode; - bool smfs_cap; - eswitch_mode = mlx5_eswitch_mode(dev); - smfs_cap = mlx5_fs_dr_is_supported(dev); + if (!strcmp(value, "smfs")) { + bool smfs_cap = 
mlx5_fs_dr_is_supported(dev); if (!smfs_cap) { - err = -EOPNOTSUPP; NL_SET_ERR_MSG_MOD(extack, "Software managed steering is not supported by current device"); + return -EOPNOTSUPP; } + } else if (!strcmp(value, "hmfs")) { + bool hmfs_cap = mlx5_fs_hws_is_supported(dev); - else if (eswitch_mode == MLX5_ESWITCH_OFFLOADS) { + if (!hmfs_cap) { NL_SET_ERR_MSG_MOD(extack, - "Software managed steering is not supported when eswitch offloads enabled."); - err = -EOPNOTSUPP; + "Hardware steering is not supported by current device"); + return -EOPNOTSUPP; } } else { NL_SET_ERR_MSG_MOD(extack, - "Bad parameter: supported values are [\"dmfs\", \"smfs\"]"); - err = -EINVAL; + "Bad parameter: supported values are [\"dmfs\", \"smfs\", \"hmfs\"]"); + return -EINVAL; } - return err; + eswitch_mode = mlx5_eswitch_mode(dev); + if (eswitch_mode == MLX5_ESWITCH_OFFLOADS) { + NL_SET_ERR_MSG_FMT_MOD(extack, + "Moving to %s is not supported when eswitch offloads enabled.", + value); + return -EOPNOTSUPP; + } + + return 0; } static int mlx5_fs_mode_set(struct devlink *devlink, u32 id, @@ -3575,6 +3582,8 @@ static int mlx5_fs_mode_set(struct devlink *devlink, u32 id, if (!strcmp(ctx->val.vstr, "smfs")) mode = MLX5_FLOW_STEERING_MODE_SMFS; + else if (!strcmp(ctx->val.vstr, "hmfs")) + mode = MLX5_FLOW_STEERING_MODE_HMFS; else mode = MLX5_FLOW_STEERING_MODE_DMFS; dev->priv.steering->mode = mode; @@ -3587,10 +3596,17 @@ static int mlx5_fs_mode_get(struct devlink *devlink, u32 id, { struct mlx5_core_dev *dev = devlink_priv(devlink); - if (dev->priv.steering->mode == MLX5_FLOW_STEERING_MODE_SMFS) + switch (dev->priv.steering->mode) { + case MLX5_FLOW_STEERING_MODE_SMFS: strscpy(ctx->val.vstr, "smfs", sizeof(ctx->val.vstr)); - else + break; + case MLX5_FLOW_STEERING_MODE_HMFS: + strscpy(ctx->val.vstr, "hmfs", sizeof(ctx->val.vstr)); + break; + default: strscpy(ctx->val.vstr, "dmfs", sizeof(ctx->val.vstr)); + } + return 0; } @@ -4009,6 +4025,8 @@ int mlx5_flow_namespace_set_mode(struct 
mlx5_flow_namespace *ns, if (mode == MLX5_FLOW_STEERING_MODE_SMFS) cmds = mlx5_fs_cmd_get_dr_cmds(); + else if (mode == MLX5_FLOW_STEERING_MODE_HMFS) + cmds = mlx5_fs_cmd_get_hws_cmds(); else cmds = mlx5_fs_cmd_get_fw_cmds(); if (!cmds) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h index bad2df0715ec..20837e526679 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h @@ -38,6 +38,7 @@ #include <linux/rhashtable.h> #include <linux/llist.h> #include <steering/sws/fs_dr.h> +#include <steering/hws/fs_hws.h> #define FDB_TC_MAX_CHAIN 3 #define FDB_FT_CHAIN (FDB_TC_MAX_CHAIN + 1) @@ -64,6 +65,7 @@ struct mlx5_modify_hdr { enum mlx5_flow_resource_owner owner; union { struct mlx5_fs_dr_action fs_dr_action; + struct mlx5_fs_hws_action fs_hws_action; u32 id; }; }; @@ -74,6 +76,7 @@ struct mlx5_pkt_reformat { enum mlx5_flow_resource_owner owner; union { struct mlx5_fs_dr_action fs_dr_action; + struct mlx5_fs_hws_action fs_hws_action; u32 id; }; }; @@ -126,7 +129,8 @@ enum fs_fte_status { enum mlx5_flow_steering_mode { MLX5_FLOW_STEERING_MODE_DMFS, - MLX5_FLOW_STEERING_MODE_SMFS + MLX5_FLOW_STEERING_MODE_SMFS, + MLX5_FLOW_STEERING_MODE_HMFS, }; enum mlx5_flow_steering_capabilty { @@ -190,7 +194,10 @@ struct mlx5_flow_handle { /* Type of children is mlx5_flow_group */ struct mlx5_flow_table { struct fs_node node; - struct mlx5_fs_dr_table fs_dr_table; + union { + struct mlx5_fs_dr_table fs_dr_table; + struct mlx5_fs_hws_table fs_hws_table; + }; u32 id; u16 vport; unsigned int max_fte; @@ -247,7 +254,10 @@ struct fs_fte_dup { /* Type of children is mlx5_flow_rule */ struct fs_fte { struct fs_node node; - struct mlx5_fs_dr_rule fs_dr_rule; + union { + struct mlx5_fs_dr_rule fs_dr_rule; + struct mlx5_fs_hws_rule fs_hws_rule; + }; u32 val[MLX5_ST_SZ_DW_MATCH_PARAM]; struct fs_fte_action act_dests; struct fs_fte_dup *dup; @@ -280,7 +290,10 @@ struct 
mlx5_flow_group_mask { /* Type of children is fs_fte */ struct mlx5_flow_group { struct fs_node node; - struct mlx5_fs_dr_matcher fs_dr_matcher; + union { + struct mlx5_fs_dr_matcher fs_dr_matcher; + struct mlx5_fs_hws_matcher fs_hws_matcher; + }; struct mlx5_flow_group_mask mask; u32 start_index; u32 max_ftes; @@ -293,7 +306,10 @@ struct mlx5_flow_group { struct mlx5_flow_root_namespace { struct mlx5_flow_namespace ns; enum mlx5_flow_steering_mode mode; - struct mlx5_fs_dr_domain fs_dr_domain; + union { + struct mlx5_fs_dr_domain fs_dr_domain; + struct mlx5_fs_hws_context fs_hws_context; + }; enum fs_flow_table_type table_type; struct mlx5_core_dev *dev; struct mlx5_flow_table *root_ft; @@ -303,6 +319,42 @@ struct mlx5_flow_root_namespace { const struct mlx5_flow_cmds *cmds; }; +enum mlx5_fc_type { + MLX5_FC_TYPE_ACQUIRED = 0, + MLX5_FC_TYPE_LOCAL, +}; + +struct mlx5_fc_cache { + u64 packets; + u64 bytes; + u64 lastuse; +}; + +struct mlx5_fc { + u32 id; + bool aging; + enum mlx5_fc_type type; + struct mlx5_fc_bulk *bulk; + struct mlx5_fc_cache cache; + /* last{packets,bytes} are used for calculating deltas since last reading. 
*/ + u64 lastpackets; + u64 lastbytes; +}; + +struct mlx5_fc_bulk_hws_data { + struct mlx5hws_action *hws_action; + struct mutex lock; /* protects hws_action */ + refcount_t hws_action_refcount; +}; + +struct mlx5_fc_bulk { + struct mlx5_fs_bulk fs_bulk; + u32 base_id; + struct mlx5_fc_bulk_hws_data hws_data; + struct mlx5_fc fcs[]; +}; + +u32 mlx5_fc_get_base_id(struct mlx5_fc *counter); int mlx5_init_fc_stats(struct mlx5_core_dev *dev); void mlx5_cleanup_fc_stats(struct mlx5_core_dev *dev); void mlx5_fc_queue_stats_work(struct mlx5_core_dev *dev, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c index d8e1c4ebd364..492775d3d193 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c @@ -44,28 +44,6 @@ #define MLX5_FC_POOL_MAX_THRESHOLD BIT(18) #define MLX5_FC_POOL_USED_BUFF_RATIO 10 -enum mlx5_fc_type { - MLX5_FC_TYPE_ACQUIRED = 0, - MLX5_FC_TYPE_LOCAL, -}; - -struct mlx5_fc_cache { - u64 packets; - u64 bytes; - u64 lastuse; -}; - -struct mlx5_fc { - u32 id; - bool aging; - enum mlx5_fc_type type; - struct mlx5_fc_bulk *bulk; - struct mlx5_fc_cache cache; - /* last{packets,bytes} are used for calculating deltas since last reading. 
*/ - u64 lastpackets; - u64 lastbytes; -}; - struct mlx5_fc_stats { struct xarray counters; @@ -434,13 +412,7 @@ void mlx5_fc_update_sampling_interval(struct mlx5_core_dev *dev, fc_stats->sampling_interval); } -/* Flow counter bluks */ - -struct mlx5_fc_bulk { - struct mlx5_fs_bulk fs_bulk; - u32 base_id; - struct mlx5_fc fcs[]; -}; +/* Flow counter bulks */ static void mlx5_fc_init(struct mlx5_fc *counter, struct mlx5_fc_bulk *bulk, u32 id) @@ -449,7 +421,13 @@ static void mlx5_fc_init(struct mlx5_fc *counter, struct mlx5_fc_bulk *bulk, counter->id = id; } -static struct mlx5_fs_bulk *mlx5_fc_bulk_create(struct mlx5_core_dev *dev) +u32 mlx5_fc_get_base_id(struct mlx5_fc *counter) +{ + return counter->bulk->base_id; +} + +static struct mlx5_fs_bulk *mlx5_fc_bulk_create(struct mlx5_core_dev *dev, + void *pool_ctx) { enum mlx5_fc_bulk_alloc_bitmask alloc_bitmask; struct mlx5_fc_bulk *fc_bulk; @@ -473,6 +451,8 @@ static struct mlx5_fs_bulk *mlx5_fc_bulk_create(struct mlx5_core_dev *dev) for (i = 0; i < bulk_len; i++) mlx5_fc_init(&fc_bulk->fcs[i], fc_bulk, base_id + i); + refcount_set(&fc_bulk->hws_data.hws_action_refcount, 0); + mutex_init(&fc_bulk->hws_data.lock); return &fc_bulk->fs_bulk; fs_bulk_cleanup: @@ -518,7 +498,7 @@ static const struct mlx5_fs_pool_ops mlx5_fc_pool_ops = { static void mlx5_fc_pool_init(struct mlx5_fs_pool *fc_pool, struct mlx5_core_dev *dev) { - mlx5_fs_pool_init(fc_pool, dev, &mlx5_fc_pool_ops); + mlx5_fs_pool_init(fc_pool, dev, &mlx5_fc_pool_ops, NULL); } static void mlx5_fc_pool_cleanup(struct mlx5_fs_pool *fc_pool) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_pool.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_pool.c index b891d7b9e3e0..f6c226664602 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_pool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_pool.c @@ -56,11 +56,12 @@ static int mlx5_fs_bulk_release_index(struct mlx5_fs_bulk *fs_bulk, int index) } void mlx5_fs_pool_init(struct mlx5_fs_pool *pool, struct 
mlx5_core_dev *dev, - const struct mlx5_fs_pool_ops *ops) + const struct mlx5_fs_pool_ops *ops, void *pool_ctx) { WARN_ON_ONCE(!ops || !ops->bulk_destroy || !ops->bulk_create || !ops->update_threshold); pool->dev = dev; + pool->pool_ctx = pool_ctx; mutex_init(&pool->pool_lock); INIT_LIST_HEAD(&pool->fully_used); INIT_LIST_HEAD(&pool->partially_used); @@ -91,7 +92,7 @@ mlx5_fs_pool_alloc_new_bulk(struct mlx5_fs_pool *fs_pool) struct mlx5_core_dev *dev = fs_pool->dev; struct mlx5_fs_bulk *new_bulk; - new_bulk = fs_pool->ops->bulk_create(dev); + new_bulk = fs_pool->ops->bulk_create(dev, fs_pool->pool_ctx); if (new_bulk) fs_pool->available_units += new_bulk->bulk_len; fs_pool->ops->update_threshold(fs_pool); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_pool.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_pool.h index 3b149863260c..f04ec3107498 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_pool.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_pool.h @@ -21,7 +21,8 @@ struct mlx5_fs_pool; struct mlx5_fs_pool_ops { int (*bulk_destroy)(struct mlx5_core_dev *dev, struct mlx5_fs_bulk *bulk); - struct mlx5_fs_bulk * (*bulk_create)(struct mlx5_core_dev *dev); + struct mlx5_fs_bulk * (*bulk_create)(struct mlx5_core_dev *dev, + void *pool_ctx); void (*update_threshold)(struct mlx5_fs_pool *pool); }; @@ -44,7 +45,7 @@ void mlx5_fs_bulk_cleanup(struct mlx5_fs_bulk *fs_bulk); int mlx5_fs_bulk_get_free_amount(struct mlx5_fs_bulk *bulk); void mlx5_fs_pool_init(struct mlx5_fs_pool *pool, struct mlx5_core_dev *dev, - const struct mlx5_fs_pool_ops *ops); + const struct mlx5_fs_pool_ops *ops, void *pool_ctx); void mlx5_fs_pool_cleanup(struct mlx5_fs_pool *pool); int mlx5_fs_pool_acquire_index(struct mlx5_fs_pool *fs_pool, struct mlx5_fs_pool_index *pool_index); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c index 4822d01123b4..d61a1a9297c9 100644 --- 
a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c @@ -322,17 +322,16 @@ static void mlx5_pps_out(struct work_struct *work) } } -static void mlx5_timestamp_overflow(struct work_struct *work) +static long mlx5_timestamp_overflow(struct ptp_clock_info *ptp_info) { - struct delayed_work *dwork = to_delayed_work(work); struct mlx5_core_dev *mdev; struct mlx5_timer *timer; struct mlx5_clock *clock; unsigned long flags; - timer = container_of(dwork, struct mlx5_timer, overflow_work); - clock = container_of(timer, struct mlx5_clock, timer); + clock = container_of(ptp_info, struct mlx5_clock, ptp_info); mdev = container_of(clock, struct mlx5_core_dev, clock); + timer = &clock->timer; if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) goto out; @@ -343,7 +342,7 @@ static void mlx5_timestamp_overflow(struct work_struct *work) write_sequnlock_irqrestore(&clock->lock, flags); out: - schedule_delayed_work(&timer->overflow_work, timer->overflow_period); + return timer->overflow_period; } static int mlx5_ptp_settime_real_time(struct mlx5_core_dev *mdev, @@ -517,6 +516,7 @@ static int mlx5_ptp_adjfine(struct ptp_clock_info *ptp, long scaled_ppm) timer->cycles.mult = mult; mlx5_update_clock_info_page(mdev); write_sequnlock_irqrestore(&clock->lock, flags); + ptp_schedule_worker(clock->ptp, timer->overflow_period); return 0; } @@ -852,6 +852,7 @@ static const struct ptp_clock_info mlx5_ptp_clock_info = { .settime64 = mlx5_ptp_settime, .enable = NULL, .verify = NULL, + .do_aux_work = mlx5_timestamp_overflow, }; static int mlx5_query_mtpps_pin_mode(struct mlx5_core_dev *mdev, u8 pin, @@ -1052,12 +1053,11 @@ static void mlx5_init_overflow_period(struct mlx5_clock *clock) do_div(ns, NSEC_PER_SEC / HZ); timer->overflow_period = ns; - INIT_DELAYED_WORK(&timer->overflow_work, mlx5_timestamp_overflow); - if (timer->overflow_period) - schedule_delayed_work(&timer->overflow_work, 0); - else + if (!timer->overflow_period) { + 
timer->overflow_period = HZ; mlx5_core_warn(mdev, - "invalid overflow period, overflow_work is not scheduled\n"); + "invalid overflow period, overflow_work is scheduled once per second\n"); + } if (clock_info) clock_info->overflow_period = timer->overflow_period; @@ -1172,6 +1172,9 @@ void mlx5_init_clock(struct mlx5_core_dev *mdev) MLX5_NB_INIT(&clock->pps_nb, mlx5_pps_event, PPS_EVENT); mlx5_eq_notifier_register(mdev, &clock->pps_nb); + + if (clock->ptp) + ptp_schedule_worker(clock->ptp, 0); } void mlx5_cleanup_clock(struct mlx5_core_dev *mdev) @@ -1188,7 +1191,6 @@ void mlx5_cleanup_clock(struct mlx5_core_dev *mdev) } cancel_work_sync(&clock->pps_info.out_work); - cancel_delayed_work_sync(&clock->timer.overflow_work); if (mdev->clock_info) { free_page((unsigned long)mdev->clock_info); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c index a897cdc60fdb..b5332c54d4fb 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c @@ -11,31 +11,29 @@ /* This is the longest supported action sequence for FDB table: * DECAP, POP_VLAN, MODIFY, CTR, ASO, PUSH_VLAN, MODIFY, ENCAP, Term. 
*/ -static const u32 action_order_arr[MLX5HWS_TABLE_TYPE_MAX][MLX5HWS_ACTION_TYP_MAX] = { - [MLX5HWS_TABLE_TYPE_FDB] = { - BIT(MLX5HWS_ACTION_TYP_REMOVE_HEADER) | - BIT(MLX5HWS_ACTION_TYP_REFORMAT_TNL_L2_TO_L2) | - BIT(MLX5HWS_ACTION_TYP_REFORMAT_TNL_L3_TO_L2), - BIT(MLX5HWS_ACTION_TYP_POP_VLAN), - BIT(MLX5HWS_ACTION_TYP_POP_VLAN), - BIT(MLX5HWS_ACTION_TYP_MODIFY_HDR), - BIT(MLX5HWS_ACTION_TYP_PUSH_VLAN), - BIT(MLX5HWS_ACTION_TYP_PUSH_VLAN), - BIT(MLX5HWS_ACTION_TYP_INSERT_HEADER) | - BIT(MLX5HWS_ACTION_TYP_REFORMAT_L2_TO_TNL_L2) | - BIT(MLX5HWS_ACTION_TYP_REFORMAT_L2_TO_TNL_L3), - BIT(MLX5HWS_ACTION_TYP_CTR), - BIT(MLX5HWS_ACTION_TYP_TAG), - BIT(MLX5HWS_ACTION_TYP_ASO_METER), - BIT(MLX5HWS_ACTION_TYP_MODIFY_HDR), - BIT(MLX5HWS_ACTION_TYP_TBL) | - BIT(MLX5HWS_ACTION_TYP_VPORT) | - BIT(MLX5HWS_ACTION_TYP_DROP) | - BIT(MLX5HWS_ACTION_TYP_SAMPLER) | - BIT(MLX5HWS_ACTION_TYP_RANGE) | - BIT(MLX5HWS_ACTION_TYP_DEST_ARRAY), - BIT(MLX5HWS_ACTION_TYP_LAST), - }, +static const u32 action_order_arr[MLX5HWS_ACTION_TYP_MAX] = { + BIT(MLX5HWS_ACTION_TYP_REMOVE_HEADER) | + BIT(MLX5HWS_ACTION_TYP_REFORMAT_TNL_L2_TO_L2) | + BIT(MLX5HWS_ACTION_TYP_REFORMAT_TNL_L3_TO_L2), + BIT(MLX5HWS_ACTION_TYP_POP_VLAN), + BIT(MLX5HWS_ACTION_TYP_POP_VLAN), + BIT(MLX5HWS_ACTION_TYP_MODIFY_HDR), + BIT(MLX5HWS_ACTION_TYP_PUSH_VLAN), + BIT(MLX5HWS_ACTION_TYP_PUSH_VLAN), + BIT(MLX5HWS_ACTION_TYP_INSERT_HEADER) | + BIT(MLX5HWS_ACTION_TYP_REFORMAT_L2_TO_TNL_L2) | + BIT(MLX5HWS_ACTION_TYP_REFORMAT_L2_TO_TNL_L3), + BIT(MLX5HWS_ACTION_TYP_CTR), + BIT(MLX5HWS_ACTION_TYP_TAG), + BIT(MLX5HWS_ACTION_TYP_ASO_METER), + BIT(MLX5HWS_ACTION_TYP_MODIFY_HDR), + BIT(MLX5HWS_ACTION_TYP_TBL) | + BIT(MLX5HWS_ACTION_TYP_VPORT) | + BIT(MLX5HWS_ACTION_TYP_DROP) | + BIT(MLX5HWS_ACTION_TYP_SAMPLER) | + BIT(MLX5HWS_ACTION_TYP_RANGE) | + BIT(MLX5HWS_ACTION_TYP_DEST_ARRAY), + BIT(MLX5HWS_ACTION_TYP_LAST), }; static const char * const mlx5hws_action_type_str[] = { @@ -83,8 +81,8 @@ static int hws_action_get_shared_stc_nic(struct 
mlx5hws_context *ctx, int ret; mutex_lock(&ctx->ctrl_lock); - if (ctx->common_res[tbl_type].shared_stc[stc_type]) { - ctx->common_res[tbl_type].shared_stc[stc_type]->refcount++; + if (ctx->common_res.shared_stc[stc_type]) { + ctx->common_res.shared_stc[stc_type]->refcount++; mutex_unlock(&ctx->ctrl_lock); return 0; } @@ -124,8 +122,8 @@ static int hws_action_get_shared_stc_nic(struct mlx5hws_context *ctx, goto free_shared_stc; } - ctx->common_res[tbl_type].shared_stc[stc_type] = shared_stc; - ctx->common_res[tbl_type].shared_stc[stc_type]->refcount = 1; + ctx->common_res.shared_stc[stc_type] = shared_stc; + ctx->common_res.shared_stc[stc_type]->refcount = 1; mutex_unlock(&ctx->ctrl_lock); @@ -178,16 +176,16 @@ static void hws_action_put_shared_stc(struct mlx5hws_action *action, } mutex_lock(&ctx->ctrl_lock); - if (--ctx->common_res[tbl_type].shared_stc[stc_type]->refcount) { + if (--ctx->common_res.shared_stc[stc_type]->refcount) { mutex_unlock(&ctx->ctrl_lock); return; } - shared_stc = ctx->common_res[tbl_type].shared_stc[stc_type]; + shared_stc = ctx->common_res.shared_stc[stc_type]; mlx5hws_action_free_single_stc(ctx, tbl_type, &shared_stc->stc_chunk); kfree(shared_stc); - ctx->common_res[tbl_type].shared_stc[stc_type] = NULL; + ctx->common_res.shared_stc[stc_type] = NULL; mutex_unlock(&ctx->ctrl_lock); } @@ -206,10 +204,10 @@ bool mlx5hws_action_check_combo(struct mlx5hws_context *ctx, enum mlx5hws_action_type *user_actions, enum mlx5hws_table_type table_type) { - const u32 *order_arr = action_order_arr[table_type]; + const u32 *order_arr = action_order_arr; + bool valid_combo; u8 order_idx = 0; u8 user_idx = 0; - bool valid_combo; if (table_type >= MLX5HWS_TABLE_TYPE_MAX) { mlx5hws_err(ctx, "Invalid table_type %d", table_type); @@ -321,8 +319,8 @@ int mlx5hws_action_alloc_single_stc(struct mlx5hws_context *ctx, __must_hold(&ctx->ctrl_lock) { struct mlx5hws_cmd_stc_modify_attr cleanup_stc_attr = {0}; - struct mlx5hws_pool *stc_pool = ctx->stc_pool[table_type]; 
struct mlx5hws_cmd_stc_modify_attr fixup_stc_attr = {0}; + struct mlx5hws_pool *stc_pool = ctx->stc_pool; bool use_fixup; u32 obj_0_id; int ret; @@ -387,8 +385,8 @@ void mlx5hws_action_free_single_stc(struct mlx5hws_context *ctx, struct mlx5hws_pool_chunk *stc) __must_hold(&ctx->ctrl_lock) { - struct mlx5hws_pool *stc_pool = ctx->stc_pool[table_type]; struct mlx5hws_cmd_stc_modify_attr stc_attr = {0}; + struct mlx5hws_pool *stc_pool = ctx->stc_pool; u32 obj_id; /* Modify the STC not to point to an object */ @@ -473,6 +471,7 @@ static void hws_action_fill_stc_attr(struct mlx5hws_action *action, break; case MLX5HWS_ACTION_TYP_TBL: case MLX5HWS_ACTION_TYP_DEST_ARRAY: + case MLX5HWS_ACTION_TYP_SAMPLER: attr->action_type = MLX5_IFC_STC_ACTION_TYPE_JUMP_TO_FT; attr->action_offset = MLX5HWS_ACTION_OFFSET_HIT; attr->dest_table_id = obj_id; @@ -561,7 +560,7 @@ hws_action_create_stcs(struct mlx5hws_action *action, u32 obj_id) if (action->flags & MLX5HWS_ACTION_FLAG_HWS_FDB) { ret = mlx5hws_action_alloc_single_stc(ctx, &stc_attr, MLX5HWS_TABLE_TYPE_FDB, - &action->stc[MLX5HWS_TABLE_TYPE_FDB]); + &action->stc); if (ret) goto out_err; } @@ -585,7 +584,7 @@ hws_action_destroy_stcs(struct mlx5hws_action *action) if (action->flags & MLX5HWS_ACTION_FLAG_HWS_FDB) mlx5hws_action_free_single_stc(ctx, MLX5HWS_TABLE_TYPE_FDB, - &action->stc[MLX5HWS_TABLE_TYPE_FDB]); + &action->stc); mutex_unlock(&ctx->ctrl_lock); } @@ -1639,8 +1638,8 @@ hws_action_create_dest_match_range_table(struct mlx5hws_context *ctx, rtc_attr.table_type = mlx5hws_table_get_res_fw_ft_type(MLX5HWS_TABLE_TYPE_FDB, false); /* STC is a single resource (obj_id), use any STC for the ID */ - stc_pool = ctx->stc_pool[MLX5HWS_TABLE_TYPE_FDB]; - default_stc = ctx->common_res[MLX5HWS_TABLE_TYPE_FDB].default_stc; + stc_pool = ctx->stc_pool; + default_stc = ctx->common_res.default_stc; obj_id = mlx5hws_pool_chunk_get_base_id(stc_pool, &default_stc->default_hit); rtc_attr.stc_base = obj_id; @@ -1731,7 +1730,7 @@ 
hws_action_create_dest_match_range_fill_table(struct mlx5hws_context *ctx, ste_attr.used_id_rtc_0 = &used_rtc_0_id; ste_attr.used_id_rtc_1 = &used_rtc_1_id; - common_res = &ctx->common_res[MLX5HWS_TABLE_TYPE_FDB]; + common_res = &ctx->common_res; /* init an empty match STE which will always hit */ ste_attr.wqe_ctrl = &wqe_ctrl; @@ -1750,7 +1749,7 @@ hws_action_create_dest_match_range_fill_table(struct mlx5hws_context *ctx, wqe_ctrl.stc_ix[MLX5HWS_ACTION_STC_IDX_CTRL] |= htonl(MLX5HWS_ACTION_STC_IDX_LAST_COMBO2 << 29); wqe_ctrl.stc_ix[MLX5HWS_ACTION_STC_IDX_HIT] = - htonl(hit_ft_action->stc[MLX5HWS_TABLE_TYPE_FDB].offset); + htonl(hit_ft_action->stc.offset); wqe_data_arr = (__force __be32 *)&range_wqe_data; @@ -1843,7 +1842,7 @@ mlx5hws_action_create_dest_match_range(struct mlx5hws_context *ctx, stc_attr.ste_table.match_definer_id = ctx->caps->trivial_match_definer; ret = mlx5hws_action_alloc_single_stc(ctx, &stc_attr, MLX5HWS_TABLE_TYPE_FDB, - &action->stc[MLX5HWS_TABLE_TYPE_FDB]); + &action->stc); if (ret) goto error_unlock; @@ -1875,7 +1874,50 @@ struct mlx5hws_action * mlx5hws_action_create_flow_sampler(struct mlx5hws_context *ctx, u32 sampler_id, u32 flags) { - mlx5hws_err(ctx, "Flow sampler action - unsupported\n"); + struct mlx5hws_cmd_ft_create_attr ft_attr = {0}; + struct mlx5hws_cmd_set_fte_attr fte_attr = {0}; + struct mlx5hws_cmd_forward_tbl *fw_island; + struct mlx5hws_cmd_set_fte_dest dest; + struct mlx5hws_action *action; + int ret; + + if (flags != (MLX5HWS_ACTION_FLAG_HWS_FDB | MLX5HWS_ACTION_FLAG_SHARED)) { + mlx5hws_err(ctx, "Unsupported flags for flow sampler\n"); + return NULL; + } + + ft_attr.type = FS_FT_FDB; + ft_attr.level = ctx->caps->fdb_ft.max_level - 1; + + dest.destination_type = MLX5_FLOW_DESTINATION_TYPE_FLOW_SAMPLER; + dest.destination_id = sampler_id; + + fte_attr.dests_num = 1; + fte_attr.dests = &dest; + fte_attr.action_flags = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + fte_attr.ignore_flow_level = 1; + + fw_island = 
mlx5hws_cmd_forward_tbl_create(ctx->mdev, &ft_attr, &fte_attr); + if (!fw_island) + return NULL; + + action = hws_action_create_generic(ctx, flags, + MLX5HWS_ACTION_TYP_SAMPLER); + if (!action) + goto destroy_fw_island; + + ret = hws_action_create_stcs(action, fw_island->ft_id); + if (ret) + goto free_action; + + action->flow_sampler.fw_island = fw_island; + + return action; + +free_action: + kfree(action); +destroy_fw_island: + mlx5hws_cmd_forward_tbl_destroy(ctx->mdev, fw_island); return NULL; } @@ -1914,6 +1956,11 @@ static void hws_action_destroy_hws(struct mlx5hws_action *action) } kfree(action->dest_array.dest_list); break; + case MLX5HWS_ACTION_TYP_SAMPLER: + hws_action_destroy_stcs(action); + mlx5hws_cmd_forward_tbl_destroy(action->ctx->mdev, + action->flow_sampler.fw_island); + break; case MLX5HWS_ACTION_TYP_REFORMAT_TNL_L3_TO_L2: case MLX5HWS_ACTION_TYP_MODIFY_HDR: shared_arg = false; @@ -1970,8 +2017,8 @@ __must_hold(&ctx->ctrl_lock) struct mlx5hws_action_default_stc *default_stc; int ret; - if (ctx->common_res[tbl_type].default_stc) { - ctx->common_res[tbl_type].default_stc->refcount++; + if (ctx->common_res.default_stc) { + ctx->common_res.default_stc->refcount++; return 0; } @@ -2023,8 +2070,8 @@ __must_hold(&ctx->ctrl_lock) goto free_nop_dw7; } - ctx->common_res[tbl_type].default_stc = default_stc; - ctx->common_res[tbl_type].default_stc->refcount++; + ctx->common_res.default_stc = default_stc; + ctx->common_res.default_stc->refcount++; return 0; @@ -2046,9 +2093,7 @@ __must_hold(&ctx->ctrl_lock) { struct mlx5hws_action_default_stc *default_stc; - default_stc = ctx->common_res[tbl_type].default_stc; - - default_stc = ctx->common_res[tbl_type].default_stc; + default_stc = ctx->common_res.default_stc; if (--default_stc->refcount) return; @@ -2058,7 +2103,7 @@ __must_hold(&ctx->ctrl_lock) mlx5hws_action_free_single_stc(ctx, tbl_type, &default_stc->nop_dw5); mlx5hws_action_free_single_stc(ctx, tbl_type, &default_stc->nop_ctr); kfree(default_stc); - 
ctx->common_res[tbl_type].default_stc = NULL; + ctx->common_res.default_stc = NULL; } static void hws_action_modify_write(struct mlx5hws_send_engine *queue, @@ -2150,8 +2195,7 @@ hws_action_apply_stc(struct mlx5hws_actions_apply_data *apply, { struct mlx5hws_action *action = apply->rule_action[action_idx].action; - apply->wqe_ctrl->stc_ix[stc_idx] = - htonl(action->stc[apply->tbl_type].offset); + apply->wqe_ctrl->stc_ix[stc_idx] = htonl(action->stc.offset); } static void @@ -2181,7 +2225,7 @@ hws_action_setter_modify_header(struct mlx5hws_actions_apply_data *apply, rule_action = &apply->rule_action[setter->idx_double]; action = rule_action->action; - stc_idx = htonl(action->stc[apply->tbl_type].offset); + stc_idx = htonl(action->stc.offset); apply->wqe_ctrl->stc_ix[MLX5HWS_ACTION_STC_IDX_DW6] = stc_idx; apply->wqe_ctrl->stc_ix[MLX5HWS_ACTION_STC_IDX_DW7] = 0; @@ -2240,7 +2284,7 @@ hws_action_setter_insert_ptr(struct mlx5hws_actions_apply_data *apply, apply->wqe_data[MLX5HWS_ACTION_OFFSET_DW6] = 0; apply->wqe_data[MLX5HWS_ACTION_OFFSET_DW7] = htonl(arg_idx); - stc_idx = htonl(action->stc[apply->tbl_type].offset); + stc_idx = htonl(action->stc.offset); apply->wqe_ctrl->stc_ix[MLX5HWS_ACTION_STC_IDX_DW6] = stc_idx; apply->wqe_ctrl->stc_ix[MLX5HWS_ACTION_STC_IDX_DW7] = 0; @@ -2272,7 +2316,7 @@ hws_action_setter_tnl_l3_to_l2(struct mlx5hws_actions_apply_data *apply, apply->wqe_data[MLX5HWS_ACTION_OFFSET_DW6] = 0; apply->wqe_data[MLX5HWS_ACTION_OFFSET_DW7] = htonl(arg_idx); - stc_idx = htonl(action->stc[apply->tbl_type].offset); + stc_idx = htonl(action->stc.offset); apply->wqe_ctrl->stc_ix[MLX5HWS_ACTION_STC_IDX_DW6] = stc_idx; apply->wqe_ctrl->stc_ix[MLX5HWS_ACTION_STC_IDX_DW7] = 0; @@ -2434,6 +2478,7 @@ int mlx5hws_action_template_process(struct mlx5hws_action_template *at) case MLX5HWS_ACTION_TYP_DROP: case MLX5HWS_ACTION_TYP_TBL: case MLX5HWS_ACTION_TYP_DEST_ARRAY: + case MLX5HWS_ACTION_TYP_SAMPLER: case MLX5HWS_ACTION_TYP_VPORT: case MLX5HWS_ACTION_TYP_MISS: /* Hit 
action */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.h index e8f562c31826..64b76075f7f8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.h @@ -70,12 +70,12 @@ struct mlx5hws_action_default_stc { struct mlx5hws_pool_chunk nop_dw6; struct mlx5hws_pool_chunk nop_dw7; struct mlx5hws_pool_chunk default_hit; - u32 refcount; + u32 refcount; /* protected by context ctrl lock */ }; struct mlx5hws_action_shared_stc { struct mlx5hws_pool_chunk stc_chunk; - u32 refcount; + u32 refcount; /* protected by context ctrl lock */ }; struct mlx5hws_actions_apply_data { @@ -124,7 +124,7 @@ struct mlx5hws_action { struct mlx5hws_context *ctx; union { struct { - struct mlx5hws_pool_chunk stc[MLX5HWS_TABLE_TYPE_MAX]; + struct mlx5hws_pool_chunk stc; union { struct { u32 pat_id; @@ -166,6 +166,9 @@ struct mlx5hws_action { struct mlx5hws_cmd_set_fte_dest *dest_list; } dest_array; struct { + struct mlx5hws_cmd_forward_tbl *fw_island; + } flow_sampler; + struct { u8 type; u8 start_anchor; u8 end_anchor; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c index baacf662c0ab..a8d886e92144 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c @@ -152,6 +152,8 @@ mlx5hws_bwc_matcher_create(struct mlx5hws_table *table, if (!bwc_matcher) return NULL; + atomic_set(&bwc_matcher->num_of_rules, 0); + /* Check if the required match params can be all matched * in single STE, otherwise complex matcher is needed. 
*/ @@ -199,10 +201,12 @@ int mlx5hws_bwc_matcher_destroy_simple(struct mlx5hws_bwc_matcher *bwc_matcher) int mlx5hws_bwc_matcher_destroy(struct mlx5hws_bwc_matcher *bwc_matcher) { - if (bwc_matcher->num_of_rules) + u32 num_of_rules = atomic_read(&bwc_matcher->num_of_rules); + + if (num_of_rules) mlx5hws_err(bwc_matcher->matcher->tbl->ctx, "BWC matcher destroy: matcher still has %d rules\n", - bwc_matcher->num_of_rules); + num_of_rules); mlx5hws_bwc_matcher_destroy_simple(bwc_matcher); @@ -215,6 +219,8 @@ static int hws_bwc_queue_poll(struct mlx5hws_context *ctx, u32 *pending_rules, bool drain) { + unsigned long timeout = jiffies + + msecs_to_jiffies(MLX5HWS_BWC_POLLING_TIMEOUT * MSEC_PER_SEC); struct mlx5hws_flow_op_result comp[MLX5HWS_BWC_MATCHER_REHASH_BURST_TH]; u16 burst_th = hws_bwc_get_burst_th(ctx, queue_id); bool got_comp = *pending_rules >= burst_th; @@ -250,6 +256,11 @@ static int hws_bwc_queue_poll(struct mlx5hws_context *ctx, } got_comp = !!ret; + + if (unlikely(!got_comp && time_after(jiffies, timeout))) { + mlx5hws_err(ctx, "BWC poll error: polling queue %d - TIMEOUT\n", queue_id); + return -ETIMEDOUT; + } } return err; @@ -309,7 +320,7 @@ static void hws_bwc_rule_list_add(struct mlx5hws_bwc_rule *bwc_rule, u16 idx) { struct mlx5hws_bwc_matcher *bwc_matcher = bwc_rule->bwc_matcher; - bwc_matcher->num_of_rules++; + atomic_inc(&bwc_matcher->num_of_rules); bwc_rule->bwc_queue_idx = idx; list_add(&bwc_rule->list_node, &bwc_matcher->rules[idx]); } @@ -318,7 +329,7 @@ static void hws_bwc_rule_list_remove(struct mlx5hws_bwc_rule *bwc_rule) { struct mlx5hws_bwc_matcher *bwc_matcher = bwc_rule->bwc_matcher; - bwc_matcher->num_of_rules--; + atomic_dec(&bwc_matcher->num_of_rules); list_del_init(&bwc_rule->list_node); } @@ -334,22 +345,21 @@ hws_bwc_rule_destroy_hws_sync(struct mlx5hws_bwc_rule *bwc_rule, struct mlx5hws_rule_attr *rule_attr) { struct mlx5hws_context *ctx = bwc_rule->bwc_matcher->matcher->tbl->ctx; - struct mlx5hws_flow_op_result completion; + u32 
expected_completions = 1; int ret; ret = hws_bwc_rule_destroy_hws_async(bwc_rule, rule_attr); if (unlikely(ret)) return ret; - do { - ret = mlx5hws_send_queue_poll(ctx, rule_attr->queue_id, &completion, 1); - } while (ret != 1); - - if (unlikely(completion.status != MLX5HWS_FLOW_OP_SUCCESS || - (bwc_rule->rule->status != MLX5HWS_RULE_STATUS_DELETED && - bwc_rule->rule->status != MLX5HWS_RULE_STATUS_DELETING))) { - mlx5hws_err(ctx, "Failed destroying BWC rule: completion %d, rule status %d\n", - completion.status, bwc_rule->rule->status); + ret = hws_bwc_queue_poll(ctx, rule_attr->queue_id, &expected_completions, true); + if (unlikely(ret)) + return ret; + + if (unlikely(bwc_rule->rule->status != MLX5HWS_RULE_STATUS_DELETED && + bwc_rule->rule->status != MLX5HWS_RULE_STATUS_DELETING)) { + mlx5hws_err(ctx, "Failed destroying BWC rule: rule status %d\n", + bwc_rule->rule->status); return -EINVAL; } @@ -615,8 +625,12 @@ static int hws_bwc_matcher_move_all_simple(struct mlx5hws_bwc_matcher *bwc_match ret = hws_bwc_queue_poll(ctx, rule_attr.queue_id, &pending_rules[i], false); - if (unlikely(ret)) + if (unlikely(ret)) { + mlx5hws_err(ctx, + "Moving BWC rule failed during rehash (%d)\n", + ret); goto free_bwc_rules; + } } } } while (!all_done); @@ -629,8 +643,11 @@ static int hws_bwc_matcher_move_all_simple(struct mlx5hws_bwc_matcher *bwc_match mlx5hws_send_engine_flush_queue(&ctx->send_queue[queue_id]); ret = hws_bwc_queue_poll(ctx, queue_id, &pending_rules[i], true); - if (unlikely(ret)) + if (unlikely(ret)) { + mlx5hws_err(ctx, + "Moving BWC rule failed during rehash (%d)\n", ret); goto free_bwc_rules; + } } } @@ -704,7 +721,8 @@ hws_bwc_matcher_rehash_size(struct mlx5hws_bwc_matcher *bwc_matcher) * Need to check again if we really need rehash. * If the reason for rehash was size, but not any more - skip rehash. 
*/ - if (!hws_bwc_matcher_rehash_size_needed(bwc_matcher, bwc_matcher->num_of_rules)) + if (!hws_bwc_matcher_rehash_size_needed(bwc_matcher, + atomic_read(&bwc_matcher->num_of_rules))) return 0; /* Now we're done all the checking - do the rehash: @@ -797,7 +815,7 @@ int mlx5hws_bwc_rule_create_simple(struct mlx5hws_bwc_rule *bwc_rule, } /* check if number of rules require rehash */ - num_of_rules = bwc_matcher->num_of_rules; + num_of_rules = atomic_read(&bwc_matcher->num_of_rules); if (unlikely(hws_bwc_matcher_rehash_size_needed(bwc_matcher, num_of_rules))) { mutex_unlock(queue_lock); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.h index 3d4965213b01..f9f569131dde 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.h @@ -8,10 +8,18 @@ #define MLX5HWS_BWC_MATCHER_SIZE_LOG_STEP 1 #define MLX5HWS_BWC_MATCHER_REHASH_PERCENT_TH 70 #define MLX5HWS_BWC_MATCHER_REHASH_BURST_TH 32 -#define MLX5HWS_BWC_MATCHER_ATTACH_AT_NUM 255 + +/* Max number of AT attach operations for the same matcher. + * When the limit is reached, next attempt to attach new AT + * will result in creation of a new matcher and moving all + * the rules to this matcher. 
+ */ +#define MLX5HWS_BWC_MATCHER_ATTACH_AT_NUM 8 #define MLX5HWS_BWC_MAX_ACTS 16 +#define MLX5HWS_BWC_POLLING_TIMEOUT 60 + struct mlx5hws_bwc_matcher { struct mlx5hws_matcher *matcher; struct mlx5hws_match_template *mt; @@ -19,7 +27,7 @@ struct mlx5hws_bwc_matcher { u8 num_of_at; u16 priority; u8 size_log; - u32 num_of_rules; /* atomically accessed */ + atomic_t num_of_rules; struct list_head *rules; }; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.c index c00c138c3366..487e75476b0a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.c @@ -257,6 +257,12 @@ int mlx5hws_cmd_set_fte(struct mlx5_core_dev *mdev, dest->ext_reformat_id); } break; + case MLX5_FLOW_DESTINATION_TYPE_FLOW_SAMPLER: + MLX5_SET(dest_format, in_dests, + destination_type, ifc_dest_type); + MLX5_SET(dest_format, in_dests, destination_id, + dest->destination_id); + break; default: ret = -EOPNOTSUPP; goto out; @@ -359,7 +365,7 @@ void mlx5hws_cmd_set_attr_connect_miss_tbl(struct mlx5hws_context *ctx, ft_attr->type = fw_ft_type; ft_attr->table_miss_action = MLX5_IFC_MODIFY_FLOW_TABLE_MISS_ACTION_GOTO_TBL; - default_miss_tbl = ctx->common_res[type].default_miss->ft_id; + default_miss_tbl = ctx->common_res.default_miss->ft_id; if (!default_miss_tbl) { pr_warn("HWS: no flow table ID for default miss\n"); return; @@ -622,12 +628,12 @@ int mlx5hws_cmd_arg_create(struct mlx5_core_dev *mdev, u32 pd, u32 *arg_id) { + u32 in[MLX5_ST_SZ_DW(create_modify_header_arg_in)] = {0}; u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {0}; - u32 in[MLX5_ST_SZ_DW(create_arg_in)] = {0}; void *attr; int ret; - attr = MLX5_ADDR_OF(create_arg_in, in, hdr); + attr = MLX5_ADDR_OF(create_modify_header_arg_in, in, hdr); MLX5_SET(general_obj_in_cmd_hdr, attr, opcode, MLX5_CMD_OP_CREATE_GENERAL_OBJECT); MLX5_SET(general_obj_in_cmd_hdr, @@ -635,8 +641,8 @@ int 
mlx5hws_cmd_arg_create(struct mlx5_core_dev *mdev, MLX5_SET(general_obj_in_cmd_hdr, attr, op_param.create.log_obj_range, log_obj_range); - attr = MLX5_ADDR_OF(create_arg_in, in, arg); - MLX5_SET(arg, attr, access_pd, pd); + attr = MLX5_ADDR_OF(create_modify_header_arg_in, in, arg); + MLX5_SET(modify_header_arg, attr, access_pd, pd); ret = mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); if (ret) { @@ -812,7 +818,7 @@ int mlx5hws_cmd_packet_reformat_create(struct mlx5_core_dev *mdev, struct mlx5hws_cmd_packet_reformat_create_attr *attr, u32 *reformat_id) { - u32 out[MLX5_ST_SZ_DW(alloc_packet_reformat_out)] = {0}; + u32 out[MLX5_ST_SZ_DW(alloc_packet_reformat_context_out)] = {0}; size_t insz, cmd_data_sz, cmd_total_sz; void *prctx; void *pdata; @@ -845,7 +851,7 @@ int mlx5hws_cmd_packet_reformat_create(struct mlx5_core_dev *mdev, goto out; } - *reformat_id = MLX5_GET(alloc_packet_reformat_out, out, packet_reformat_id); + *reformat_id = MLX5_GET(alloc_packet_reformat_context_out, out, packet_reformat_id); out: kfree(in); return ret; @@ -854,13 +860,13 @@ out: int mlx5hws_cmd_packet_reformat_destroy(struct mlx5_core_dev *mdev, u32 reformat_id) { - u32 out[MLX5_ST_SZ_DW(dealloc_packet_reformat_out)] = {0}; - u32 in[MLX5_ST_SZ_DW(dealloc_packet_reformat_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(dealloc_packet_reformat_context_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(dealloc_packet_reformat_context_in)] = {0}; int ret; - MLX5_SET(dealloc_packet_reformat_in, in, opcode, + MLX5_SET(dealloc_packet_reformat_context_in, in, opcode, MLX5_CMD_OP_DEALLOC_PACKET_REFORMAT_CONTEXT); - MLX5_SET(dealloc_packet_reformat_in, in, + MLX5_SET(dealloc_packet_reformat_context_in, in, packet_reformat_id, reformat_id); ret = mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); @@ -889,73 +895,6 @@ int mlx5hws_cmd_sq_modify_rdy(struct mlx5_core_dev *mdev, u32 sqn) return ret; } -int mlx5hws_cmd_allow_other_vhca_access(struct mlx5_core_dev *mdev, - struct mlx5hws_cmd_allow_other_vhca_access_attr 
*attr) -{ - u32 out[MLX5_ST_SZ_DW(allow_other_vhca_access_out)] = {0}; - u32 in[MLX5_ST_SZ_DW(allow_other_vhca_access_in)] = {0}; - void *key; - int ret; - - MLX5_SET(allow_other_vhca_access_in, - in, opcode, MLX5_CMD_OP_ALLOW_OTHER_VHCA_ACCESS); - MLX5_SET(allow_other_vhca_access_in, - in, object_type_to_be_accessed, attr->obj_type); - MLX5_SET(allow_other_vhca_access_in, - in, object_id_to_be_accessed, attr->obj_id); - - key = MLX5_ADDR_OF(allow_other_vhca_access_in, in, access_key); - memcpy(key, attr->access_key, sizeof(attr->access_key)); - - ret = mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); - if (ret) - mlx5_core_err(mdev, "Failed to execute ALLOW_OTHER_VHCA_ACCESS command\n"); - - return ret; -} - -int mlx5hws_cmd_alias_obj_create(struct mlx5_core_dev *mdev, - struct mlx5hws_cmd_alias_obj_create_attr *alias_attr, - u32 *obj_id) -{ - u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {0}; - u32 in[MLX5_ST_SZ_DW(create_alias_obj_in)] = {0}; - void *attr; - void *key; - int ret; - - attr = MLX5_ADDR_OF(create_alias_obj_in, in, hdr); - MLX5_SET(general_obj_in_cmd_hdr, - attr, opcode, MLX5_CMD_OP_CREATE_GENERAL_OBJECT); - MLX5_SET(general_obj_in_cmd_hdr, - attr, obj_type, alias_attr->obj_type); - MLX5_SET(general_obj_in_cmd_hdr, attr, op_param.create.alias_object, 1); - - attr = MLX5_ADDR_OF(create_alias_obj_in, in, alias_ctx); - MLX5_SET(alias_context, attr, vhca_id_to_be_accessed, alias_attr->vhca_id); - MLX5_SET(alias_context, attr, object_id_to_be_accessed, alias_attr->obj_id); - - key = MLX5_ADDR_OF(alias_context, attr, access_key); - memcpy(key, alias_attr->access_key, sizeof(alias_attr->access_key)); - - ret = mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); - if (ret) { - mlx5_core_err(mdev, "Failed to create ALIAS OBJ\n"); - goto out; - } - - *obj_id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id); -out: - return ret; -} - -int mlx5hws_cmd_alias_obj_destroy(struct mlx5_core_dev *mdev, - u16 obj_type, - u32 obj_id) -{ - return 
hws_cmd_general_obj_destroy(mdev, obj_type, obj_id); -} - int mlx5hws_cmd_generate_wqe(struct mlx5_core_dev *mdev, struct mlx5hws_cmd_generate_wqe_attr *attr, struct mlx5_cqe64 *ret_cqe) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.h index 434f62b0904e..610c63d81ad9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.h @@ -63,7 +63,7 @@ struct mlx5hws_cmd_forward_tbl { u8 type; u32 ft_id; u32 fg_id; - u32 refcount; + u32 refcount; /* protected by context ctrl lock */ }; struct mlx5hws_cmd_rtc_create_attr { @@ -334,14 +334,6 @@ mlx5hws_cmd_forward_tbl_create(struct mlx5_core_dev *mdev, void mlx5hws_cmd_forward_tbl_destroy(struct mlx5_core_dev *mdev, struct mlx5hws_cmd_forward_tbl *tbl); -int mlx5hws_cmd_alias_obj_create(struct mlx5_core_dev *mdev, - struct mlx5hws_cmd_alias_obj_create_attr *alias_attr, - u32 *obj_id); - -int mlx5hws_cmd_alias_obj_destroy(struct mlx5_core_dev *mdev, - u16 obj_type, - u32 obj_id); - int mlx5hws_cmd_sq_modify_rdy(struct mlx5_core_dev *mdev, u32 sqn); int mlx5hws_cmd_query_caps(struct mlx5_core_dev *mdev, @@ -352,9 +344,6 @@ void mlx5hws_cmd_set_attr_connect_miss_tbl(struct mlx5hws_context *ctx, enum mlx5hws_table_type type, struct mlx5hws_cmd_ft_modify_attr *ft_attr); -int mlx5hws_cmd_allow_other_vhca_access(struct mlx5_core_dev *mdev, - struct mlx5hws_cmd_allow_other_vhca_access_attr *attr); - int mlx5hws_cmd_query_gvmi(struct mlx5_core_dev *mdev, bool other_function, u16 vport_number, u16 *gvmi); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.c index 4a8928f33bb9..9cda2774fd64 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.c @@ -23,7 +23,6 @@ static int hws_context_pools_init(struct 
mlx5hws_context *ctx) struct mlx5hws_pool_attr pool_attr = {0}; u8 max_log_sz; int ret; - int i; ret = mlx5hws_pat_init_pattern_cache(&ctx->pattern_cache); if (ret) @@ -39,23 +38,17 @@ static int hws_context_pools_init(struct mlx5hws_context *ctx) max_log_sz = min(MLX5HWS_POOL_STC_LOG_SZ, ctx->caps->stc_alloc_log_max); pool_attr.alloc_log_sz = max(max_log_sz, ctx->caps->stc_alloc_log_gran); - for (i = 0; i < MLX5HWS_TABLE_TYPE_MAX; i++) { - pool_attr.table_type = i; - ctx->stc_pool[i] = mlx5hws_pool_create(ctx, &pool_attr); - if (!ctx->stc_pool[i]) { - mlx5hws_err(ctx, "Failed to allocate STC pool [%d]", i); - ret = -ENOMEM; - goto free_stc_pools; - } + pool_attr.table_type = MLX5HWS_TABLE_TYPE_FDB; + ctx->stc_pool = mlx5hws_pool_create(ctx, &pool_attr); + if (!ctx->stc_pool) { + mlx5hws_err(ctx, "Failed to allocate STC pool\n"); + ret = -ENOMEM; + goto uninit_cache; } return 0; -free_stc_pools: - for (i = 0; i < MLX5HWS_TABLE_TYPE_MAX; i++) - if (ctx->stc_pool[i]) - mlx5hws_pool_destroy(ctx->stc_pool[i]); - +uninit_cache: mlx5hws_definer_uninit_cache(ctx->definer_cache); uninit_pat_cache: mlx5hws_pat_uninit_pattern_cache(ctx->pattern_cache); @@ -64,12 +57,8 @@ uninit_pat_cache: static void hws_context_pools_uninit(struct mlx5hws_context *ctx) { - int i; - - for (i = 0; i < MLX5HWS_TABLE_TYPE_MAX; i++) { - if (ctx->stc_pool[i]) - mlx5hws_pool_destroy(ctx->stc_pool[i]); - } + if (ctx->stc_pool) + mlx5hws_pool_destroy(ctx->stc_pool); mlx5hws_definer_uninit_cache(ctx->definer_cache); mlx5hws_pat_uninit_pattern_cache(ctx->pattern_cache); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.h index 1c9cc4fba083..38c3647444ad 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.h @@ -38,8 +38,8 @@ struct mlx5hws_context { struct mlx5_core_dev *mdev; struct mlx5hws_cmd_query_caps *caps; u32 pd_num; 
- struct mlx5hws_pool *stc_pool[MLX5HWS_TABLE_TYPE_MAX]; - struct mlx5hws_context_common_res common_res[MLX5HWS_TABLE_TYPE_MAX]; + struct mlx5hws_pool *stc_pool; + struct mlx5hws_context_common_res common_res; struct mlx5hws_pattern_cache *pattern_cache; struct mlx5hws_definer_cache *definer_cache; struct mutex ctrl_lock; /* control lock to protect the whole context */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/debug.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/debug.c index 5b200b4bc1a8..696275fd0ce2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/debug.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/debug.c @@ -148,8 +148,8 @@ static int hws_debug_dump_matcher(struct seq_file *f, struct mlx5hws_matcher *ma matcher->match_ste.rtc_1_id, (int)ste_1_id); - ste = &matcher->action_ste[0].ste; - ste_pool = matcher->action_ste[0].pool; + ste = &matcher->action_ste.ste; + ste_pool = matcher->action_ste.pool; if (ste_pool) { ste_0_id = mlx5hws_pool_chunk_get_base_id(ste_pool, ste); if (tbl_type == MLX5HWS_TABLE_TYPE_FDB) @@ -171,10 +171,8 @@ static int hws_debug_dump_matcher(struct seq_file *f, struct mlx5hws_matcher *ma return ret; seq_printf(f, ",%d,%d,%d,%d,%d,0x%llx,0x%llx\n", - matcher->action_ste[0].rtc_0_id, - (int)ste_0_id, - matcher->action_ste[0].rtc_1_id, - (int)ste_1_id, + matcher->action_ste.rtc_0_id, (int)ste_0_id, + matcher->action_ste.rtc_1_id, (int)ste_1_id, 0, mlx5hws_debug_icm_to_idx(icm_addr_0), mlx5hws_debug_icm_to_idx(icm_addr_1)); @@ -368,9 +366,10 @@ static int hws_debug_dump_context_info(struct seq_file *f, struct mlx5hws_contex static int hws_debug_dump_context_stc_resource(struct seq_file *f, struct mlx5hws_context *ctx, - u32 tbl_type, struct mlx5hws_pool_resource *resource) { + u32 tbl_type = MLX5HWS_TABLE_TYPE_BASE + MLX5HWS_TABLE_TYPE_FDB; + seq_printf(f, "%d,0x%llx,%u,%u\n", MLX5HWS_DEBUG_RES_TYPE_CONTEXT_STC, HWS_PTR_TO_ID(ctx), @@ -382,31 +381,22 @@ static int 
hws_debug_dump_context_stc_resource(struct seq_file *f, static int hws_debug_dump_context_stc(struct seq_file *f, struct mlx5hws_context *ctx) { - struct mlx5hws_pool *stc_pool; - u32 table_type; + struct mlx5hws_pool *stc_pool = ctx->stc_pool; int ret; - int i; - for (i = 0; i < MLX5HWS_TABLE_TYPE_MAX; i++) { - stc_pool = ctx->stc_pool[i]; - table_type = MLX5HWS_TABLE_TYPE_BASE + i; - - if (!stc_pool) - continue; + if (!stc_pool) + return 0; - if (stc_pool->resource[0]) { - ret = hws_debug_dump_context_stc_resource(f, ctx, table_type, - stc_pool->resource[0]); - if (ret) - return ret; - } + if (stc_pool->resource[0]) { + ret = hws_debug_dump_context_stc_resource(f, ctx, stc_pool->resource[0]); + if (ret) + return ret; + } - if (i == MLX5HWS_TABLE_TYPE_FDB && stc_pool->mirror_resource[0]) { - ret = hws_debug_dump_context_stc_resource(f, ctx, table_type, - stc_pool->mirror_resource[0]); - if (ret) - return ret; - } + if (stc_pool->mirror_resource[0]) { + ret = hws_debug_dump_context_stc_resource(f, ctx, stc_pool->mirror_resource[0]); + if (ret) + return ret; } return 0; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/definer.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/definer.c index 8fe96eb76baf..10ece7df1cfa 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/definer.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/definer.c @@ -70,7 +70,7 @@ u32 second_dw_mask = (mask) & ((1 << _bit_off) - 1); \ _HWS_SET32(p, (v) >> _bit_off, byte_off, 0, (mask) >> _bit_off); \ _HWS_SET32(p, (v) & second_dw_mask, (byte_off) + DW_SIZE, \ - (bit_off) % BITS_IN_DW, second_dw_mask); \ + (bit_off + BITS_IN_DW) % BITS_IN_DW, second_dw_mask); \ } else { \ _HWS_SET32(p, v, byte_off, (bit_off), (mask)); \ } \ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/definer.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/definer.h index 9432d5084def..5c1a2086efba 100644 --- 
a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/definer.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/definer.h @@ -785,7 +785,7 @@ struct mlx5hws_definer_cache { struct mlx5hws_definer_cache_item { struct mlx5hws_definer definer; - u32 refcount; + u32 refcount; /* protected by context ctrl lock */ struct list_head list_node; }; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws.c new file mode 100644 index 000000000000..05329afeb9ea --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws.c @@ -0,0 +1,1377 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2025 NVIDIA Corporation & Affiliates */ + +#include <linux/mlx5/vport.h> +#include <mlx5_core.h> +#include <fs_core.h> +#include <fs_cmd.h> +#include "fs_hws_pools.h" +#include "mlx5hws.h" + +#define MLX5HWS_CTX_MAX_NUM_OF_QUEUES 16 +#define MLX5HWS_CTX_QUEUE_SIZE 256 + +static struct mlx5hws_action * +mlx5_fs_create_action_remove_header_vlan(struct mlx5hws_context *ctx); +static void +mlx5_fs_destroy_pr_pool(struct mlx5_fs_pool *pool, struct xarray *pr_pools, + unsigned long index); +static void +mlx5_fs_destroy_mh_pool(struct mlx5_fs_pool *pool, struct xarray *mh_pools, + unsigned long index); + +static int mlx5_fs_init_hws_actions_pool(struct mlx5_core_dev *dev, + struct mlx5_fs_hws_context *fs_ctx) +{ + u32 flags = MLX5HWS_ACTION_FLAG_HWS_FDB | MLX5HWS_ACTION_FLAG_SHARED; + struct mlx5_fs_hws_actions_pool *hws_pool = &fs_ctx->hws_pool; + struct mlx5hws_action_reformat_header reformat_hdr = {}; + struct mlx5hws_context *ctx = fs_ctx->hws_ctx; + enum mlx5hws_action_type action_type; + int err = -ENOSPC; + + hws_pool->tag_action = mlx5hws_action_create_tag(ctx, flags); + if (!hws_pool->tag_action) + return err; + hws_pool->pop_vlan_action = mlx5hws_action_create_pop_vlan(ctx, flags); + if (!hws_pool->pop_vlan_action) + goto destroy_tag; + 
hws_pool->push_vlan_action = mlx5hws_action_create_push_vlan(ctx, flags); + if (!hws_pool->push_vlan_action) + goto destroy_pop_vlan; + hws_pool->drop_action = mlx5hws_action_create_dest_drop(ctx, flags); + if (!hws_pool->drop_action) + goto destroy_push_vlan; + action_type = MLX5HWS_ACTION_TYP_REFORMAT_TNL_L2_TO_L2; + hws_pool->decapl2_action = + mlx5hws_action_create_reformat(ctx, action_type, 1, + &reformat_hdr, 0, flags); + if (!hws_pool->decapl2_action) + goto destroy_drop; + hws_pool->remove_hdr_vlan_action = + mlx5_fs_create_action_remove_header_vlan(ctx); + if (!hws_pool->remove_hdr_vlan_action) + goto destroy_decapl2; + err = mlx5_fs_hws_pr_pool_init(&hws_pool->insert_hdr_pool, dev, 0, + MLX5HWS_ACTION_TYP_INSERT_HEADER); + if (err) + goto destroy_remove_hdr; + err = mlx5_fs_hws_pr_pool_init(&hws_pool->dl3tnltol2_pool, dev, 0, + MLX5HWS_ACTION_TYP_REFORMAT_TNL_L3_TO_L2); + if (err) + goto cleanup_insert_hdr; + xa_init(&hws_pool->el2tol3tnl_pools); + xa_init(&hws_pool->el2tol2tnl_pools); + xa_init(&hws_pool->mh_pools); + xa_init(&hws_pool->table_dests); + xa_init(&hws_pool->vport_dests); + xa_init(&hws_pool->vport_vhca_dests); + return 0; + +cleanup_insert_hdr: + mlx5_fs_hws_pr_pool_cleanup(&hws_pool->insert_hdr_pool); +destroy_remove_hdr: + mlx5hws_action_destroy(hws_pool->remove_hdr_vlan_action); +destroy_decapl2: + mlx5hws_action_destroy(hws_pool->decapl2_action); +destroy_drop: + mlx5hws_action_destroy(hws_pool->drop_action); +destroy_push_vlan: + mlx5hws_action_destroy(hws_pool->push_vlan_action); +destroy_pop_vlan: + mlx5hws_action_destroy(hws_pool->pop_vlan_action); +destroy_tag: + mlx5hws_action_destroy(hws_pool->tag_action); + return err; +} + +static void mlx5_fs_cleanup_hws_actions_pool(struct mlx5_fs_hws_context *fs_ctx) +{ + struct mlx5_fs_hws_actions_pool *hws_pool = &fs_ctx->hws_pool; + struct mlx5hws_action *action; + struct mlx5_fs_pool *pool; + unsigned long i; + + xa_for_each(&hws_pool->vport_vhca_dests, i, action) + 
mlx5hws_action_destroy(action); + xa_destroy(&hws_pool->vport_vhca_dests); + xa_for_each(&hws_pool->vport_dests, i, action) + mlx5hws_action_destroy(action); + xa_destroy(&hws_pool->vport_dests); + xa_destroy(&hws_pool->table_dests); + xa_for_each(&hws_pool->mh_pools, i, pool) + mlx5_fs_destroy_mh_pool(pool, &hws_pool->mh_pools, i); + xa_destroy(&hws_pool->mh_pools); + xa_for_each(&hws_pool->el2tol2tnl_pools, i, pool) + mlx5_fs_destroy_pr_pool(pool, &hws_pool->el2tol2tnl_pools, i); + xa_destroy(&hws_pool->el2tol2tnl_pools); + xa_for_each(&hws_pool->el2tol3tnl_pools, i, pool) + mlx5_fs_destroy_pr_pool(pool, &hws_pool->el2tol3tnl_pools, i); + xa_destroy(&hws_pool->el2tol3tnl_pools); + mlx5_fs_hws_pr_pool_cleanup(&hws_pool->dl3tnltol2_pool); + mlx5_fs_hws_pr_pool_cleanup(&hws_pool->insert_hdr_pool); + mlx5hws_action_destroy(hws_pool->remove_hdr_vlan_action); + mlx5hws_action_destroy(hws_pool->decapl2_action); + mlx5hws_action_destroy(hws_pool->drop_action); + mlx5hws_action_destroy(hws_pool->push_vlan_action); + mlx5hws_action_destroy(hws_pool->pop_vlan_action); + mlx5hws_action_destroy(hws_pool->tag_action); +} + +static int mlx5_cmd_hws_create_ns(struct mlx5_flow_root_namespace *ns) +{ + struct mlx5hws_context_attr hws_ctx_attr = {}; + int err; + + hws_ctx_attr.queues = min_t(int, num_online_cpus(), + MLX5HWS_CTX_MAX_NUM_OF_QUEUES); + hws_ctx_attr.queue_size = MLX5HWS_CTX_QUEUE_SIZE; + + ns->fs_hws_context.hws_ctx = + mlx5hws_context_open(ns->dev, &hws_ctx_attr); + if (!ns->fs_hws_context.hws_ctx) { + mlx5_core_err(ns->dev, "Failed to create hws flow namespace\n"); + return -EINVAL; + } + err = mlx5_fs_init_hws_actions_pool(ns->dev, &ns->fs_hws_context); + if (err) { + mlx5_core_err(ns->dev, "Failed to init hws actions pool\n"); + mlx5hws_context_close(ns->fs_hws_context.hws_ctx); + return err; + } + return 0; +} + +static int mlx5_cmd_hws_destroy_ns(struct mlx5_flow_root_namespace *ns) +{ + mlx5_fs_cleanup_hws_actions_pool(&ns->fs_hws_context); + return 
mlx5hws_context_close(ns->fs_hws_context.hws_ctx); +} + +static int mlx5_cmd_hws_set_peer(struct mlx5_flow_root_namespace *ns, + struct mlx5_flow_root_namespace *peer_ns, + u16 peer_vhca_id) +{ + struct mlx5hws_context *peer_ctx = NULL; + + if (peer_ns) + peer_ctx = peer_ns->fs_hws_context.hws_ctx; + mlx5hws_context_set_peer(ns->fs_hws_context.hws_ctx, peer_ctx, + peer_vhca_id); + return 0; +} + +static int mlx5_fs_set_ft_default_miss(struct mlx5_flow_root_namespace *ns, + struct mlx5_flow_table *ft, + struct mlx5_flow_table *next_ft) +{ + struct mlx5hws_table *next_tbl; + int err; + + if (!ns->fs_hws_context.hws_ctx) + return -EINVAL; + + /* if no change required, return */ + if (!next_ft && !ft->fs_hws_table.miss_ft_set) + return 0; + + next_tbl = next_ft ? next_ft->fs_hws_table.hws_table : NULL; + err = mlx5hws_table_set_default_miss(ft->fs_hws_table.hws_table, next_tbl); + if (err) { + mlx5_core_err(ns->dev, "Failed setting FT default miss (%d)\n", err); + return err; + } + ft->fs_hws_table.miss_ft_set = !!next_tbl; + return 0; +} + +static int mlx5_fs_add_flow_table_dest_action(struct mlx5_flow_root_namespace *ns, + struct mlx5_flow_table *ft) +{ + u32 flags = MLX5HWS_ACTION_FLAG_HWS_FDB | MLX5HWS_ACTION_FLAG_SHARED; + struct mlx5_fs_hws_context *fs_ctx = &ns->fs_hws_context; + struct mlx5hws_action *dest_ft_action; + struct xarray *dests_xa; + int err; + + dest_ft_action = mlx5hws_action_create_dest_table_num(fs_ctx->hws_ctx, + ft->id, flags); + if (!dest_ft_action) { + mlx5_core_err(ns->dev, "Failed creating dest table action\n"); + return -ENOMEM; + } + + dests_xa = &fs_ctx->hws_pool.table_dests; + err = xa_insert(dests_xa, ft->id, dest_ft_action, GFP_KERNEL); + if (err) + mlx5hws_action_destroy(dest_ft_action); + return err; +} + +static int mlx5_fs_del_flow_table_dest_action(struct mlx5_flow_root_namespace *ns, + struct mlx5_flow_table *ft) +{ + struct mlx5_fs_hws_context *fs_ctx = &ns->fs_hws_context; + struct mlx5hws_action *dest_ft_action; + struct 
xarray *dests_xa; + int err; + + dests_xa = &fs_ctx->hws_pool.table_dests; + dest_ft_action = xa_erase(dests_xa, ft->id); + if (!dest_ft_action) { + mlx5_core_err(ns->dev, "Failed to erase dest ft action\n"); + return -ENOENT; + } + + err = mlx5hws_action_destroy(dest_ft_action); + if (err) + mlx5_core_err(ns->dev, "Failed to destroy dest ft action\n"); + return err; +} + +static int mlx5_cmd_hws_create_flow_table(struct mlx5_flow_root_namespace *ns, + struct mlx5_flow_table *ft, + struct mlx5_flow_table_attr *ft_attr, + struct mlx5_flow_table *next_ft) +{ + struct mlx5hws_context *ctx = ns->fs_hws_context.hws_ctx; + struct mlx5hws_table_attr tbl_attr = {}; + struct mlx5hws_table *tbl; + int err; + + if (mlx5_fs_cmd_is_fw_term_table(ft)) { + err = mlx5_fs_cmd_get_fw_cmds()->create_flow_table(ns, ft, ft_attr, + next_ft); + if (err) + return err; + err = mlx5_fs_add_flow_table_dest_action(ns, ft); + if (err) + mlx5_fs_cmd_get_fw_cmds()->destroy_flow_table(ns, ft); + return err; + } + + if (ns->table_type != FS_FT_FDB) { + mlx5_core_err(ns->dev, "Table type %d not supported for HWS\n", + ns->table_type); + return -EOPNOTSUPP; + } + + tbl_attr.type = MLX5HWS_TABLE_TYPE_FDB; + tbl_attr.level = ft_attr->level; + tbl = mlx5hws_table_create(ctx, &tbl_attr); + if (!tbl) { + mlx5_core_err(ns->dev, "Failed creating hws flow_table\n"); + return -EINVAL; + } + + ft->fs_hws_table.hws_table = tbl; + ft->id = mlx5hws_table_get_id(tbl); + + if (next_ft) { + err = mlx5_fs_set_ft_default_miss(ns, ft, next_ft); + if (err) + goto destroy_table; + } + + ft->max_fte = INT_MAX; + + err = mlx5_fs_add_flow_table_dest_action(ns, ft); + if (err) + goto clear_ft_miss; + return 0; + +clear_ft_miss: + mlx5_fs_set_ft_default_miss(ns, ft, NULL); +destroy_table: + mlx5hws_table_destroy(tbl); + ft->fs_hws_table.hws_table = NULL; + return err; +} + +static int mlx5_cmd_hws_destroy_flow_table(struct mlx5_flow_root_namespace *ns, + struct mlx5_flow_table *ft) +{ + int err; + + err = 
mlx5_fs_del_flow_table_dest_action(ns, ft); + if (err) + mlx5_core_err(ns->dev, "Failed to remove dest action (%d)\n", err); + + if (mlx5_fs_cmd_is_fw_term_table(ft)) + return mlx5_fs_cmd_get_fw_cmds()->destroy_flow_table(ns, ft); + + err = mlx5_fs_set_ft_default_miss(ns, ft, NULL); + if (err) + mlx5_core_err(ns->dev, "Failed to disconnect next table (%d)\n", err); + + err = mlx5hws_table_destroy(ft->fs_hws_table.hws_table); + if (err) + mlx5_core_err(ns->dev, "Failed to destroy flow_table (%d)\n", err); + + return err; +} + +static int mlx5_cmd_hws_modify_flow_table(struct mlx5_flow_root_namespace *ns, + struct mlx5_flow_table *ft, + struct mlx5_flow_table *next_ft) +{ + if (mlx5_fs_cmd_is_fw_term_table(ft)) + return mlx5_fs_cmd_get_fw_cmds()->modify_flow_table(ns, ft, next_ft); + + return mlx5_fs_set_ft_default_miss(ns, ft, next_ft); +} + +static int mlx5_cmd_hws_update_root_ft(struct mlx5_flow_root_namespace *ns, + struct mlx5_flow_table *ft, + u32 underlay_qpn, + bool disconnect) +{ + return mlx5_fs_cmd_get_fw_cmds()->update_root_ft(ns, ft, underlay_qpn, + disconnect); +} + +static int mlx5_cmd_hws_create_flow_group(struct mlx5_flow_root_namespace *ns, + struct mlx5_flow_table *ft, u32 *in, + struct mlx5_flow_group *fg) +{ + struct mlx5hws_match_parameters mask; + struct mlx5hws_bwc_matcher *matcher; + u8 match_criteria_enable; + u32 priority; + + if (mlx5_fs_cmd_is_fw_term_table(ft)) + return mlx5_fs_cmd_get_fw_cmds()->create_flow_group(ns, ft, in, fg); + + mask.match_buf = MLX5_ADDR_OF(create_flow_group_in, in, match_criteria); + mask.match_sz = sizeof(fg->mask.match_criteria); + + match_criteria_enable = MLX5_GET(create_flow_group_in, in, + match_criteria_enable); + priority = MLX5_GET(create_flow_group_in, in, start_flow_index); + matcher = mlx5hws_bwc_matcher_create(ft->fs_hws_table.hws_table, + priority, match_criteria_enable, + &mask); + if (!matcher) { + mlx5_core_err(ns->dev, "Failed creating matcher\n"); + return -EINVAL; + } + + 
fg->fs_hws_matcher.matcher = matcher; + return 0; +} + +static int mlx5_cmd_hws_destroy_flow_group(struct mlx5_flow_root_namespace *ns, + struct mlx5_flow_table *ft, + struct mlx5_flow_group *fg) +{ + if (mlx5_fs_cmd_is_fw_term_table(ft)) + return mlx5_fs_cmd_get_fw_cmds()->destroy_flow_group(ns, ft, fg); + + return mlx5hws_bwc_matcher_destroy(fg->fs_hws_matcher.matcher); +} + +static struct mlx5hws_action * +mlx5_fs_get_dest_action_ft(struct mlx5_fs_hws_context *fs_ctx, + struct mlx5_flow_rule *dst) +{ + return xa_load(&fs_ctx->hws_pool.table_dests, dst->dest_attr.ft->id); +} + +static struct mlx5hws_action * +mlx5_fs_get_dest_action_table_num(struct mlx5_fs_hws_context *fs_ctx, + struct mlx5_flow_rule *dst) +{ + u32 table_num = dst->dest_attr.ft_num; + + return xa_load(&fs_ctx->hws_pool.table_dests, table_num); +} + +static struct mlx5hws_action * +mlx5_fs_create_dest_action_table_num(struct mlx5_fs_hws_context *fs_ctx, + struct mlx5_flow_rule *dst) +{ + u32 flags = MLX5HWS_ACTION_FLAG_HWS_FDB | MLX5HWS_ACTION_FLAG_SHARED; + struct mlx5hws_context *ctx = fs_ctx->hws_ctx; + u32 table_num = dst->dest_attr.ft_num; + + return mlx5hws_action_create_dest_table_num(ctx, table_num, flags); +} + +static struct mlx5hws_action * +mlx5_fs_get_dest_action_vport(struct mlx5_fs_hws_context *fs_ctx, + struct mlx5_flow_rule *dst, + bool is_dest_type_uplink) +{ + u32 flags = MLX5HWS_ACTION_FLAG_HWS_FDB | MLX5HWS_ACTION_FLAG_SHARED; + struct mlx5_flow_destination *dest_attr = &dst->dest_attr; + struct mlx5hws_context *ctx = fs_ctx->hws_ctx; + struct mlx5hws_action *dest; + struct xarray *dests_xa; + bool vhca_id_valid; + unsigned long idx; + u16 vport_num; + int err; + + vhca_id_valid = is_dest_type_uplink || + (dest_attr->vport.flags & MLX5_FLOW_DEST_VPORT_VHCA_ID); + vport_num = is_dest_type_uplink ? 
MLX5_VPORT_UPLINK : dest_attr->vport.num; + if (vhca_id_valid) { + dests_xa = &fs_ctx->hws_pool.vport_vhca_dests; + idx = dest_attr->vport.vhca_id << 16 | vport_num; + } else { + dests_xa = &fs_ctx->hws_pool.vport_dests; + idx = vport_num; + } +dest_load: + dest = xa_load(dests_xa, idx); + if (dest) + return dest; + + dest = mlx5hws_action_create_dest_vport(ctx, vport_num, vhca_id_valid, + dest_attr->vport.vhca_id, flags); + + err = xa_insert(dests_xa, idx, dest, GFP_KERNEL); + if (err) { + mlx5hws_action_destroy(dest); + dest = NULL; + + if (err == -EBUSY) + /* xarray entry was already stored by another thread */ + goto dest_load; + } + + return dest; +} + +static struct mlx5hws_action * +mlx5_fs_create_dest_action_range(struct mlx5hws_context *ctx, + struct mlx5_flow_rule *dst) +{ + u32 flags = MLX5HWS_ACTION_FLAG_HWS_FDB | MLX5HWS_ACTION_FLAG_SHARED; + struct mlx5_flow_destination *dest_attr = &dst->dest_attr; + + return mlx5hws_action_create_dest_match_range(ctx, + dest_attr->range.field, + dest_attr->range.hit_ft, + dest_attr->range.miss_ft, + dest_attr->range.min, + dest_attr->range.max, + flags); +} + +static struct mlx5hws_action * +mlx5_fs_create_action_dest_array(struct mlx5hws_context *ctx, + struct mlx5hws_action_dest_attr *dests, + u32 num_of_dests, bool ignore_flow_level, + u32 flow_source) +{ + u32 flags = MLX5HWS_ACTION_FLAG_HWS_FDB | MLX5HWS_ACTION_FLAG_SHARED; + + return mlx5hws_action_create_dest_array(ctx, num_of_dests, dests, + ignore_flow_level, + flow_source, flags); +} + +static struct mlx5hws_action * +mlx5_fs_get_action_push_vlan(struct mlx5_fs_hws_context *fs_ctx) +{ + return fs_ctx->hws_pool.push_vlan_action; +} + +static u32 mlx5_fs_calc_vlan_hdr(struct mlx5_fs_vlan *vlan) +{ + u16 n_ethtype = vlan->ethtype; + u8 prio = vlan->prio; + u16 vid = vlan->vid; + + return (u32)n_ethtype << 16 | (u32)(prio) << 12 | (u32)vid; +} + +static struct mlx5hws_action * +mlx5_fs_get_action_pop_vlan(struct mlx5_fs_hws_context *fs_ctx) +{ + return 
fs_ctx->hws_pool.pop_vlan_action; +} + +static struct mlx5hws_action * +mlx5_fs_get_action_decap_tnl_l2_to_l2(struct mlx5_fs_hws_context *fs_ctx) +{ + return fs_ctx->hws_pool.decapl2_action; +} + +static struct mlx5hws_action * +mlx5_fs_get_dest_action_drop(struct mlx5_fs_hws_context *fs_ctx) +{ + return fs_ctx->hws_pool.drop_action; +} + +static struct mlx5hws_action * +mlx5_fs_get_action_tag(struct mlx5_fs_hws_context *fs_ctx) +{ + return fs_ctx->hws_pool.tag_action; +} + +static struct mlx5hws_action * +mlx5_fs_create_action_last(struct mlx5hws_context *ctx) +{ + u32 flags = MLX5HWS_ACTION_FLAG_HWS_FDB | MLX5HWS_ACTION_FLAG_SHARED; + + return mlx5hws_action_create_last(ctx, flags); +} + +static void mlx5_fs_destroy_fs_action(struct mlx5_fs_hws_rule_action *fs_action) +{ + switch (mlx5hws_action_get_type(fs_action->action)) { + case MLX5HWS_ACTION_TYP_CTR: + mlx5_fc_put_hws_action(fs_action->counter); + break; + default: + mlx5hws_action_destroy(fs_action->action); + } +} + +static void +mlx5_fs_destroy_fs_actions(struct mlx5_fs_hws_rule_action **fs_actions, + int *num_fs_actions) +{ + int i; + + /* Free in reverse order to handle action dependencies */ + for (i = *num_fs_actions - 1; i >= 0; i--) + mlx5_fs_destroy_fs_action(*fs_actions + i); + *num_fs_actions = 0; + kfree(*fs_actions); + *fs_actions = NULL; +} + +/* Splits FTE's actions into cached, rule and destination actions. + * The cached and destination actions are saved on the fte hws rule. + * The rule actions are returned as a parameter, together with their count. + * We want to support a rule with 32 destinations, which means we need to + * account for 32 destinations plus usually a counter plus one more action + * for a multi-destination flow table. + * 32 is SW limitation for array size, keep. 
HWS limitation is 16M STEs per matcher + */ +#define MLX5_FLOW_CONTEXT_ACTION_MAX 34 +static int mlx5_fs_fte_get_hws_actions(struct mlx5_flow_root_namespace *ns, + struct mlx5_flow_table *ft, + struct mlx5_flow_group *group, + struct fs_fte *fte, + struct mlx5hws_rule_action **ractions) +{ + struct mlx5_flow_act *fte_action = &fte->act_dests.action; + struct mlx5_fs_hws_context *fs_ctx = &ns->fs_hws_context; + struct mlx5hws_action_dest_attr *dest_actions; + struct mlx5hws_context *ctx = fs_ctx->hws_ctx; + struct mlx5_fs_hws_rule_action *fs_actions; + struct mlx5_core_dev *dev = ns->dev; + struct mlx5hws_action *dest_action; + struct mlx5hws_action *tmp_action; + struct mlx5_fs_hws_pr *pr_data; + struct mlx5_fs_hws_mh *mh_data; + bool delay_encap_set = false; + struct mlx5_flow_rule *dst; + int num_dest_actions = 0; + int num_fs_actions = 0; + int num_actions = 0; + int err; + + *ractions = kcalloc(MLX5_FLOW_CONTEXT_ACTION_MAX, sizeof(**ractions), + GFP_KERNEL); + if (!*ractions) { + err = -ENOMEM; + goto out_err; + } + + fs_actions = kcalloc(MLX5_FLOW_CONTEXT_ACTION_MAX, + sizeof(*fs_actions), GFP_KERNEL); + if (!fs_actions) { + err = -ENOMEM; + goto free_actions_alloc; + } + + dest_actions = kcalloc(MLX5_FLOW_CONTEXT_ACTION_MAX, + sizeof(*dest_actions), GFP_KERNEL); + if (!dest_actions) { + err = -ENOMEM; + goto free_fs_actions_alloc; + } + + /* The order of the actions are must to be kept, only the following + * order is supported by HW steering: + * HWS: decap -> remove_hdr -> pop_vlan -> modify header -> push_vlan + * -> reformat (insert_hdr/encap) -> ctr -> tag -> aso + * -> drop -> FWD:tbl/vport/sampler/tbl_num/range -> dest_array -> last + */ + if (fte_action->action & MLX5_FLOW_CONTEXT_ACTION_DECAP) { + tmp_action = mlx5_fs_get_action_decap_tnl_l2_to_l2(fs_ctx); + if (!tmp_action) { + err = -ENOMEM; + goto free_dest_actions_alloc; + } + (*ractions)[num_actions++].action = tmp_action; + } + + if (fte_action->action & 
MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT) { + int reformat_type = fte_action->pkt_reformat->reformat_type; + + if (fte_action->pkt_reformat->owner == MLX5_FLOW_RESOURCE_OWNER_FW) { + mlx5_core_err(dev, "FW-owned reformat can't be used in HWS rule\n"); + err = -EINVAL; + goto free_actions; + } + + if (reformat_type == MLX5_REFORMAT_TYPE_L3_TUNNEL_TO_L2) { + pr_data = fte_action->pkt_reformat->fs_hws_action.pr_data; + (*ractions)[num_actions].reformat.offset = pr_data->offset; + (*ractions)[num_actions].reformat.hdr_idx = pr_data->hdr_idx; + (*ractions)[num_actions].reformat.data = pr_data->data; + (*ractions)[num_actions++].action = + fte_action->pkt_reformat->fs_hws_action.hws_action; + } else if (reformat_type == MLX5_REFORMAT_TYPE_REMOVE_HDR) { + (*ractions)[num_actions++].action = + fte_action->pkt_reformat->fs_hws_action.hws_action; + } else { + delay_encap_set = true; + } + } + + if (fte_action->action & MLX5_FLOW_CONTEXT_ACTION_VLAN_POP) { + tmp_action = mlx5_fs_get_action_pop_vlan(fs_ctx); + if (!tmp_action) { + err = -ENOMEM; + goto free_actions; + } + (*ractions)[num_actions++].action = tmp_action; + } + + if (fte_action->action & MLX5_FLOW_CONTEXT_ACTION_VLAN_POP_2) { + tmp_action = mlx5_fs_get_action_pop_vlan(fs_ctx); + if (!tmp_action) { + err = -ENOMEM; + goto free_actions; + } + (*ractions)[num_actions++].action = tmp_action; + } + + if (fte_action->action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR) { + mh_data = fte_action->modify_hdr->fs_hws_action.mh_data; + (*ractions)[num_actions].modify_header.offset = mh_data->offset; + (*ractions)[num_actions].modify_header.data = mh_data->data; + (*ractions)[num_actions++].action = + fte_action->modify_hdr->fs_hws_action.hws_action; + } + + if (fte_action->action & MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH) { + tmp_action = mlx5_fs_get_action_push_vlan(fs_ctx); + if (!tmp_action) { + err = -ENOMEM; + goto free_actions; + } + (*ractions)[num_actions].push_vlan.vlan_hdr = + htonl(mlx5_fs_calc_vlan_hdr(&fte_action->vlan[0])); 
+ (*ractions)[num_actions++].action = tmp_action; + } + + if (fte_action->action & MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH_2) { + tmp_action = mlx5_fs_get_action_push_vlan(fs_ctx); + if (!tmp_action) { + err = -ENOMEM; + goto free_actions; + } + (*ractions)[num_actions].push_vlan.vlan_hdr = + htonl(mlx5_fs_calc_vlan_hdr(&fte_action->vlan[1])); + (*ractions)[num_actions++].action = tmp_action; + } + + if (delay_encap_set) { + pr_data = fte_action->pkt_reformat->fs_hws_action.pr_data; + (*ractions)[num_actions].reformat.offset = pr_data->offset; + (*ractions)[num_actions].reformat.data = pr_data->data; + (*ractions)[num_actions++].action = + fte_action->pkt_reformat->fs_hws_action.hws_action; + } + + if (fte_action->action & MLX5_FLOW_CONTEXT_ACTION_COUNT) { + list_for_each_entry(dst, &fte->node.children, node.list) { + struct mlx5_fc *counter; + + if (dst->dest_attr.type != + MLX5_FLOW_DESTINATION_TYPE_COUNTER) + continue; + + if (num_actions == MLX5_FLOW_CONTEXT_ACTION_MAX) { + err = -EOPNOTSUPP; + goto free_actions; + } + + counter = dst->dest_attr.counter; + tmp_action = mlx5_fc_get_hws_action(ctx, counter); + if (!tmp_action) { + err = -EINVAL; + goto free_actions; + } + + (*ractions)[num_actions].counter.offset = + mlx5_fc_id(counter) - mlx5_fc_get_base_id(counter); + (*ractions)[num_actions++].action = tmp_action; + fs_actions[num_fs_actions].action = tmp_action; + fs_actions[num_fs_actions++].counter = counter; + } + } + + if (fte->act_dests.flow_context.flow_tag) { + if (num_actions == MLX5_FLOW_CONTEXT_ACTION_MAX) { + err = -EOPNOTSUPP; + goto free_actions; + } + tmp_action = mlx5_fs_get_action_tag(fs_ctx); + if (!tmp_action) { + err = -ENOMEM; + goto free_actions; + } + (*ractions)[num_actions].tag.value = fte->act_dests.flow_context.flow_tag; + (*ractions)[num_actions++].action = tmp_action; + } + + if (fte_action->action & MLX5_FLOW_CONTEXT_ACTION_EXECUTE_ASO) { + err = -EOPNOTSUPP; + goto free_actions; + } + + if (fte_action->action & 
MLX5_FLOW_CONTEXT_ACTION_DROP) { + dest_action = mlx5_fs_get_dest_action_drop(fs_ctx); + if (!dest_action) { + err = -ENOMEM; + goto free_actions; + } + dest_actions[num_dest_actions++].dest = dest_action; + } + + if (fte_action->action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST) { + list_for_each_entry(dst, &fte->node.children, node.list) { + struct mlx5_flow_destination *attr = &dst->dest_attr; + bool type_uplink = + attr->type == MLX5_FLOW_DESTINATION_TYPE_UPLINK; + + if (num_fs_actions == MLX5_FLOW_CONTEXT_ACTION_MAX || + num_dest_actions == MLX5_FLOW_CONTEXT_ACTION_MAX) { + err = -EOPNOTSUPP; + goto free_actions; + } + if (attr->type == MLX5_FLOW_DESTINATION_TYPE_COUNTER) + continue; + + switch (attr->type) { + case MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE: + dest_action = mlx5_fs_get_dest_action_ft(fs_ctx, dst); + break; + case MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE_NUM: + dest_action = mlx5_fs_get_dest_action_table_num(fs_ctx, + dst); + if (dest_action) + break; + dest_action = mlx5_fs_create_dest_action_table_num(fs_ctx, + dst); + fs_actions[num_fs_actions++].action = dest_action; + break; + case MLX5_FLOW_DESTINATION_TYPE_RANGE: + dest_action = mlx5_fs_create_dest_action_range(ctx, dst); + fs_actions[num_fs_actions++].action = dest_action; + break; + case MLX5_FLOW_DESTINATION_TYPE_UPLINK: + case MLX5_FLOW_DESTINATION_TYPE_VPORT: + dest_action = mlx5_fs_get_dest_action_vport(fs_ctx, dst, + type_uplink); + break; + default: + err = -EOPNOTSUPP; + goto free_actions; + } + if (!dest_action) { + err = -ENOMEM; + goto free_actions; + } + dest_actions[num_dest_actions++].dest = dest_action; + } + } + + if (num_dest_actions == 1) { + if (num_actions == MLX5_FLOW_CONTEXT_ACTION_MAX) { + err = -EOPNOTSUPP; + goto free_actions; + } + (*ractions)[num_actions++].action = dest_actions->dest; + } else if (num_dest_actions > 1) { + u32 flow_source = fte->act_dests.flow_context.flow_source; + bool ignore_flow_level; + + if (num_actions == MLX5_FLOW_CONTEXT_ACTION_MAX || + 
num_fs_actions == MLX5_FLOW_CONTEXT_ACTION_MAX) { + err = -EOPNOTSUPP; + goto free_actions; + } + ignore_flow_level = + !!(fte_action->flags & FLOW_ACT_IGNORE_FLOW_LEVEL); + tmp_action = mlx5_fs_create_action_dest_array(ctx, dest_actions, + num_dest_actions, + ignore_flow_level, + flow_source); + if (!tmp_action) { + err = -EOPNOTSUPP; + goto free_actions; + } + fs_actions[num_fs_actions++].action = tmp_action; + (*ractions)[num_actions++].action = tmp_action; + } + + if (num_actions == MLX5_FLOW_CONTEXT_ACTION_MAX || + num_fs_actions == MLX5_FLOW_CONTEXT_ACTION_MAX) { + err = -EOPNOTSUPP; + goto free_actions; + } + + tmp_action = mlx5_fs_create_action_last(ctx); + if (!tmp_action) { + err = -ENOMEM; + goto free_actions; + } + fs_actions[num_fs_actions++].action = tmp_action; + (*ractions)[num_actions++].action = tmp_action; + + kfree(dest_actions); + + /* Actions created specifically for this rule will be destroyed + * once rule is deleted. + */ + fte->fs_hws_rule.num_fs_actions = num_fs_actions; + fte->fs_hws_rule.hws_fs_actions = fs_actions; + + return 0; + +free_actions: + mlx5_fs_destroy_fs_actions(&fs_actions, &num_fs_actions); +free_dest_actions_alloc: + kfree(dest_actions); +free_fs_actions_alloc: + kfree(fs_actions); +free_actions_alloc: + kfree(*ractions); + *ractions = NULL; +out_err: + return err; +} + +static int mlx5_cmd_hws_create_fte(struct mlx5_flow_root_namespace *ns, + struct mlx5_flow_table *ft, + struct mlx5_flow_group *group, + struct fs_fte *fte) +{ + struct mlx5hws_match_parameters params; + struct mlx5hws_rule_action *ractions; + struct mlx5hws_bwc_rule *rule; + int err = 0; + + if (mlx5_fs_cmd_is_fw_term_table(ft)) { + /* Packet reformat on terminamtion table not supported yet */ + if (fte->act_dests.action.action & + MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT) + return -EOPNOTSUPP; + return mlx5_fs_cmd_get_fw_cmds()->create_fte(ns, ft, group, fte); + } + + err = mlx5_fs_fte_get_hws_actions(ns, ft, group, fte, &ractions); + if (err) + goto 
out_err; + + params.match_sz = sizeof(fte->val); + params.match_buf = fte->val; + + rule = mlx5hws_bwc_rule_create(group->fs_hws_matcher.matcher, ¶ms, + fte->act_dests.flow_context.flow_source, + ractions); + kfree(ractions); + if (!rule) { + err = -EINVAL; + goto free_actions; + } + + fte->fs_hws_rule.bwc_rule = rule; + return 0; + +free_actions: + mlx5_fs_destroy_fs_actions(&fte->fs_hws_rule.hws_fs_actions, + &fte->fs_hws_rule.num_fs_actions); +out_err: + mlx5_core_err(ns->dev, "Failed to create hws rule err(%d)\n", err); + return err; +} + +static int mlx5_cmd_hws_delete_fte(struct mlx5_flow_root_namespace *ns, + struct mlx5_flow_table *ft, + struct fs_fte *fte) +{ + struct mlx5_fs_hws_rule *rule = &fte->fs_hws_rule; + int err; + + if (mlx5_fs_cmd_is_fw_term_table(ft)) + return mlx5_fs_cmd_get_fw_cmds()->delete_fte(ns, ft, fte); + + err = mlx5hws_bwc_rule_destroy(rule->bwc_rule); + rule->bwc_rule = NULL; + + mlx5_fs_destroy_fs_actions(&rule->hws_fs_actions, &rule->num_fs_actions); + + return err; +} + +static int mlx5_cmd_hws_update_fte(struct mlx5_flow_root_namespace *ns, + struct mlx5_flow_table *ft, + struct mlx5_flow_group *group, + int modify_mask, + struct fs_fte *fte) +{ + int allowed_mask = BIT(MLX5_SET_FTE_MODIFY_ENABLE_MASK_ACTION) | + BIT(MLX5_SET_FTE_MODIFY_ENABLE_MASK_DESTINATION_LIST) | + BIT(MLX5_SET_FTE_MODIFY_ENABLE_MASK_FLOW_COUNTERS); + struct mlx5_fs_hws_rule_action *saved_hws_fs_actions; + struct mlx5hws_rule_action *ractions; + int saved_num_fs_actions; + int ret; + + if (mlx5_fs_cmd_is_fw_term_table(ft)) + return mlx5_fs_cmd_get_fw_cmds()->update_fte(ns, ft, group, + modify_mask, fte); + + if ((modify_mask & ~allowed_mask) != 0) + return -EINVAL; + + saved_hws_fs_actions = fte->fs_hws_rule.hws_fs_actions; + saved_num_fs_actions = fte->fs_hws_rule.num_fs_actions; + + ret = mlx5_fs_fte_get_hws_actions(ns, ft, group, fte, &ractions); + if (ret) + return ret; + + ret = mlx5hws_bwc_rule_action_update(fte->fs_hws_rule.bwc_rule, ractions); + 
kfree(ractions); + if (ret) + goto restore_actions; + + mlx5_fs_destroy_fs_actions(&saved_hws_fs_actions, &saved_num_fs_actions); + return ret; + +restore_actions: + mlx5_fs_destroy_fs_actions(&fte->fs_hws_rule.hws_fs_actions, + &fte->fs_hws_rule.num_fs_actions); + fte->fs_hws_rule.hws_fs_actions = saved_hws_fs_actions; + fte->fs_hws_rule.num_fs_actions = saved_num_fs_actions; + return ret; +} + +static struct mlx5hws_action * +mlx5_fs_create_action_remove_header_vlan(struct mlx5hws_context *ctx) +{ + u32 flags = MLX5HWS_ACTION_FLAG_HWS_FDB | MLX5HWS_ACTION_FLAG_SHARED; + struct mlx5hws_action_remove_header_attr remove_hdr_vlan = {}; + + /* MAC anchor not supported in HWS reformat, use VLAN anchor */ + remove_hdr_vlan.anchor = MLX5_REFORMAT_CONTEXT_ANCHOR_VLAN_START; + remove_hdr_vlan.offset = 0; + remove_hdr_vlan.size = sizeof(struct vlan_hdr); + return mlx5hws_action_create_remove_header(ctx, &remove_hdr_vlan, flags); +} + +static struct mlx5hws_action * +mlx5_fs_get_action_remove_header_vlan(struct mlx5_fs_hws_context *fs_ctx, + struct mlx5_pkt_reformat_params *params) +{ + if (!params || + params->param_0 != MLX5_REFORMAT_CONTEXT_ANCHOR_MAC_START || + params->param_1 != offsetof(struct vlan_ethhdr, h_vlan_proto) || + params->size != sizeof(struct vlan_hdr)) + return NULL; + + return fs_ctx->hws_pool.remove_hdr_vlan_action; +} + +static int +mlx5_fs_verify_insert_header_params(struct mlx5_core_dev *mdev, + struct mlx5_pkt_reformat_params *params) +{ + if ((!params->data && params->size) || (params->data && !params->size) || + MLX5_CAP_GEN_2(mdev, max_reformat_insert_size) < params->size || + MLX5_CAP_GEN_2(mdev, max_reformat_insert_offset) < params->param_1) { + mlx5_core_err(mdev, "Invalid reformat params for INSERT_HDR\n"); + return -EINVAL; + } + if (params->param_0 != MLX5_FS_INSERT_HDR_VLAN_ANCHOR || + params->param_1 != MLX5_FS_INSERT_HDR_VLAN_OFFSET || + params->size != MLX5_FS_INSERT_HDR_VLAN_SIZE) { + mlx5_core_err(mdev, "Only vlan insert header 
supported\n"); + return -EOPNOTSUPP; + } + return 0; +} + +static int +mlx5_fs_verify_encap_decap_params(struct mlx5_core_dev *dev, + struct mlx5_pkt_reformat_params *params) +{ + if (params->param_0 || params->param_1) { + mlx5_core_err(dev, "Invalid reformat params\n"); + return -EINVAL; + } + return 0; +} + +static struct mlx5_fs_pool * +mlx5_fs_get_pr_encap_pool(struct mlx5_core_dev *dev, struct xarray *pr_pools, + enum mlx5hws_action_type reformat_type, size_t size) +{ + struct mlx5_fs_pool *pr_pool; + unsigned long index = size; + int err; + + pr_pool = xa_load(pr_pools, index); + if (pr_pool) + return pr_pool; + + pr_pool = kzalloc(sizeof(*pr_pool), GFP_KERNEL); + if (!pr_pool) + return ERR_PTR(-ENOMEM); + err = mlx5_fs_hws_pr_pool_init(pr_pool, dev, size, reformat_type); + if (err) + goto free_pr_pool; + err = xa_insert(pr_pools, index, pr_pool, GFP_KERNEL); + if (err) + goto cleanup_pr_pool; + return pr_pool; + +cleanup_pr_pool: + mlx5_fs_hws_pr_pool_cleanup(pr_pool); +free_pr_pool: + kfree(pr_pool); + return ERR_PTR(err); +} + +static void +mlx5_fs_destroy_pr_pool(struct mlx5_fs_pool *pool, struct xarray *pr_pools, + unsigned long index) +{ + xa_erase(pr_pools, index); + mlx5_fs_hws_pr_pool_cleanup(pool); + kfree(pool); +} + +static int +mlx5_cmd_hws_packet_reformat_alloc(struct mlx5_flow_root_namespace *ns, + struct mlx5_pkt_reformat_params *params, + enum mlx5_flow_namespace_type namespace, + struct mlx5_pkt_reformat *pkt_reformat) +{ + struct mlx5_fs_hws_context *fs_ctx = &ns->fs_hws_context; + struct mlx5_fs_hws_actions_pool *hws_pool; + struct mlx5hws_action *hws_action = NULL; + struct mlx5_fs_hws_pr *pr_data = NULL; + struct mlx5_fs_pool *pr_pool = NULL; + struct mlx5_core_dev *dev = ns->dev; + u8 hdr_idx = 0; + int err; + + if (!params) + return -EINVAL; + + hws_pool = &fs_ctx->hws_pool; + + switch (params->type) { + case MLX5_REFORMAT_TYPE_L2_TO_VXLAN: + case MLX5_REFORMAT_TYPE_L2_TO_NVGRE: + case MLX5_REFORMAT_TYPE_L2_TO_L2_TUNNEL: + if 
(mlx5_fs_verify_encap_decap_params(dev, params)) + return -EINVAL; + pr_pool = mlx5_fs_get_pr_encap_pool(dev, &hws_pool->el2tol2tnl_pools, + MLX5HWS_ACTION_TYP_REFORMAT_L2_TO_TNL_L2, + params->size); + if (IS_ERR(pr_pool)) + return PTR_ERR(pr_pool); + break; + case MLX5_REFORMAT_TYPE_L2_TO_L3_TUNNEL: + if (mlx5_fs_verify_encap_decap_params(dev, params)) + return -EINVAL; + pr_pool = mlx5_fs_get_pr_encap_pool(dev, &hws_pool->el2tol3tnl_pools, + MLX5HWS_ACTION_TYP_REFORMAT_L2_TO_TNL_L3, + params->size); + if (IS_ERR(pr_pool)) + return PTR_ERR(pr_pool); + break; + case MLX5_REFORMAT_TYPE_L3_TUNNEL_TO_L2: + if (mlx5_fs_verify_encap_decap_params(dev, params)) + return -EINVAL; + pr_pool = &hws_pool->dl3tnltol2_pool; + hdr_idx = params->size == ETH_HLEN ? + MLX5_FS_DL3TNLTOL2_MAC_HDR_IDX : + MLX5_FS_DL3TNLTOL2_MAC_VLAN_HDR_IDX; + break; + case MLX5_REFORMAT_TYPE_INSERT_HDR: + err = mlx5_fs_verify_insert_header_params(dev, params); + if (err) + return err; + pr_pool = &hws_pool->insert_hdr_pool; + break; + case MLX5_REFORMAT_TYPE_REMOVE_HDR: + hws_action = mlx5_fs_get_action_remove_header_vlan(fs_ctx, params); + if (!hws_action) + mlx5_core_err(dev, "Only vlan remove header supported\n"); + break; + default: + mlx5_core_err(ns->dev, "Packet-reformat not supported(%d)\n", + params->type); + return -EOPNOTSUPP; + } + + if (pr_pool) { + pr_data = mlx5_fs_hws_pr_pool_acquire_pr(pr_pool); + if (IS_ERR_OR_NULL(pr_data)) + return !pr_data ? 
-EINVAL : PTR_ERR(pr_data); + hws_action = pr_data->bulk->hws_action; + if (!hws_action) { + mlx5_core_err(dev, + "Failed allocating packet-reformat action\n"); + err = -EINVAL; + goto release_pr; + } + pr_data->data = kmemdup(params->data, params->size, GFP_KERNEL); + if (!pr_data->data) { + err = -ENOMEM; + goto release_pr; + } + pr_data->hdr_idx = hdr_idx; + pr_data->data_size = params->size; + pkt_reformat->fs_hws_action.pr_data = pr_data; + } + + pkt_reformat->owner = MLX5_FLOW_RESOURCE_OWNER_SW; + pkt_reformat->fs_hws_action.hws_action = hws_action; + return 0; + +release_pr: + if (pr_pool && pr_data) + mlx5_fs_hws_pr_pool_release_pr(pr_pool, pr_data); + return err; +} + +static void mlx5_cmd_hws_packet_reformat_dealloc(struct mlx5_flow_root_namespace *ns, + struct mlx5_pkt_reformat *pkt_reformat) +{ + struct mlx5_fs_hws_actions_pool *hws_pool = &ns->fs_hws_context.hws_pool; + struct mlx5_core_dev *dev = ns->dev; + struct mlx5_fs_hws_pr *pr_data; + struct mlx5_fs_pool *pr_pool; + + if (pkt_reformat->reformat_type == MLX5_REFORMAT_TYPE_REMOVE_HDR) + return; + + if (!pkt_reformat->fs_hws_action.pr_data) { + mlx5_core_err(ns->dev, "Failed release packet-reformat\n"); + return; + } + pr_data = pkt_reformat->fs_hws_action.pr_data; + + switch (pkt_reformat->reformat_type) { + case MLX5_REFORMAT_TYPE_L2_TO_VXLAN: + case MLX5_REFORMAT_TYPE_L2_TO_NVGRE: + case MLX5_REFORMAT_TYPE_L2_TO_L2_TUNNEL: + pr_pool = mlx5_fs_get_pr_encap_pool(dev, &hws_pool->el2tol2tnl_pools, + MLX5HWS_ACTION_TYP_REFORMAT_L2_TO_TNL_L2, + pr_data->data_size); + break; + case MLX5_REFORMAT_TYPE_L2_TO_L3_TUNNEL: + pr_pool = mlx5_fs_get_pr_encap_pool(dev, &hws_pool->el2tol2tnl_pools, + MLX5HWS_ACTION_TYP_REFORMAT_L2_TO_TNL_L2, + pr_data->data_size); + break; + case MLX5_REFORMAT_TYPE_L3_TUNNEL_TO_L2: + pr_pool = &hws_pool->dl3tnltol2_pool; + break; + case MLX5_REFORMAT_TYPE_INSERT_HDR: + pr_pool = &hws_pool->insert_hdr_pool; + break; + default: + mlx5_core_err(ns->dev, "Unknown packet-reformat 
type\n"); + return; + } + if (!pkt_reformat->fs_hws_action.pr_data || IS_ERR(pr_pool)) { + mlx5_core_err(ns->dev, "Failed release packet-reformat\n"); + return; + } + kfree(pr_data->data); + mlx5_fs_hws_pr_pool_release_pr(pr_pool, pr_data); + pkt_reformat->fs_hws_action.pr_data = NULL; +} + +static struct mlx5_fs_pool * +mlx5_fs_create_mh_pool(struct mlx5_core_dev *dev, + struct mlx5hws_action_mh_pattern *pattern, + struct xarray *mh_pools, unsigned long index) +{ + struct mlx5_fs_pool *pool; + int err; + + pool = kzalloc(sizeof(*pool), GFP_KERNEL); + if (!pool) + return ERR_PTR(-ENOMEM); + err = mlx5_fs_hws_mh_pool_init(pool, dev, pattern); + if (err) + goto free_pool; + err = xa_insert(mh_pools, index, pool, GFP_KERNEL); + if (err) + goto cleanup_pool; + return pool; + +cleanup_pool: + mlx5_fs_hws_mh_pool_cleanup(pool); +free_pool: + kfree(pool); + return ERR_PTR(err); +} + +static void +mlx5_fs_destroy_mh_pool(struct mlx5_fs_pool *pool, struct xarray *mh_pools, + unsigned long index) +{ + xa_erase(mh_pools, index); + mlx5_fs_hws_mh_pool_cleanup(pool); + kfree(pool); +} + +static int mlx5_cmd_hws_modify_header_alloc(struct mlx5_flow_root_namespace *ns, + u8 namespace, u8 num_actions, + void *modify_actions, + struct mlx5_modify_hdr *modify_hdr) +{ + struct mlx5_fs_hws_actions_pool *hws_pool = &ns->fs_hws_context.hws_pool; + struct mlx5hws_action_mh_pattern pattern = {}; + struct mlx5_fs_hws_mh *mh_data = NULL; + struct mlx5hws_action *hws_action; + struct mlx5_fs_pool *pool; + unsigned long i, cnt = 0; + bool known_pattern; + int err; + + pattern.sz = MLX5_UN_SZ_BYTES(set_add_copy_action_in_auto) * num_actions; + pattern.data = modify_actions; + + known_pattern = false; + xa_for_each(&hws_pool->mh_pools, i, pool) { + if (mlx5_fs_hws_mh_pool_match(pool, &pattern)) { + known_pattern = true; + break; + } + cnt++; + } + + if (!known_pattern) { + pool = mlx5_fs_create_mh_pool(ns->dev, &pattern, + &hws_pool->mh_pools, cnt); + if (IS_ERR(pool)) + return PTR_ERR(pool); + 
} + mh_data = mlx5_fs_hws_mh_pool_acquire_mh(pool); + if (IS_ERR(mh_data)) { + err = PTR_ERR(mh_data); + goto destroy_pool; + } + hws_action = mh_data->bulk->hws_action; + mh_data->data = kmemdup(pattern.data, pattern.sz, GFP_KERNEL); + if (!mh_data->data) { + err = -ENOMEM; + goto release_mh; + } + modify_hdr->fs_hws_action.mh_data = mh_data; + modify_hdr->fs_hws_action.fs_pool = pool; + modify_hdr->owner = MLX5_FLOW_RESOURCE_OWNER_SW; + modify_hdr->fs_hws_action.hws_action = hws_action; + + return 0; + +release_mh: + mlx5_fs_hws_mh_pool_release_mh(pool, mh_data); +destroy_pool: + if (!known_pattern) + mlx5_fs_destroy_mh_pool(pool, &hws_pool->mh_pools, cnt); + return err; +} + +static void mlx5_cmd_hws_modify_header_dealloc(struct mlx5_flow_root_namespace *ns, + struct mlx5_modify_hdr *modify_hdr) +{ + struct mlx5_fs_hws_mh *mh_data; + struct mlx5_fs_pool *pool; + + if (!modify_hdr->fs_hws_action.fs_pool || !modify_hdr->fs_hws_action.mh_data) { + mlx5_core_err(ns->dev, "Failed release modify-header\n"); + return; + } + + mh_data = modify_hdr->fs_hws_action.mh_data; + kfree(mh_data->data); + pool = modify_hdr->fs_hws_action.fs_pool; + mlx5_fs_hws_mh_pool_release_mh(pool, mh_data); + modify_hdr->fs_hws_action.mh_data = NULL; +} + +static int mlx5_cmd_hws_create_match_definer(struct mlx5_flow_root_namespace *ns, + u16 format_id, u32 *match_mask) +{ + return -EOPNOTSUPP; +} + +static int mlx5_cmd_hws_destroy_match_definer(struct mlx5_flow_root_namespace *ns, + int definer_id) +{ + return -EOPNOTSUPP; +} + +static u32 mlx5_cmd_hws_get_capabilities(struct mlx5_flow_root_namespace *ns, + enum fs_flow_table_type ft_type) +{ + if (ft_type != FS_FT_FDB) + return 0; + + return MLX5_FLOW_STEERING_CAP_VLAN_PUSH_ON_RX | + MLX5_FLOW_STEERING_CAP_VLAN_POP_ON_TX | + MLX5_FLOW_STEERING_CAP_MATCH_RANGES; +} + +bool mlx5_fs_hws_is_supported(struct mlx5_core_dev *dev) +{ + return mlx5hws_is_supported(dev); +} + +static const struct mlx5_flow_cmds mlx5_flow_cmds_hws = { + 
.create_flow_table = mlx5_cmd_hws_create_flow_table, + .destroy_flow_table = mlx5_cmd_hws_destroy_flow_table, + .modify_flow_table = mlx5_cmd_hws_modify_flow_table, + .update_root_ft = mlx5_cmd_hws_update_root_ft, + .create_flow_group = mlx5_cmd_hws_create_flow_group, + .destroy_flow_group = mlx5_cmd_hws_destroy_flow_group, + .create_fte = mlx5_cmd_hws_create_fte, + .delete_fte = mlx5_cmd_hws_delete_fte, + .update_fte = mlx5_cmd_hws_update_fte, + .packet_reformat_alloc = mlx5_cmd_hws_packet_reformat_alloc, + .packet_reformat_dealloc = mlx5_cmd_hws_packet_reformat_dealloc, + .modify_header_alloc = mlx5_cmd_hws_modify_header_alloc, + .modify_header_dealloc = mlx5_cmd_hws_modify_header_dealloc, + .create_match_definer = mlx5_cmd_hws_create_match_definer, + .destroy_match_definer = mlx5_cmd_hws_destroy_match_definer, + .create_ns = mlx5_cmd_hws_create_ns, + .destroy_ns = mlx5_cmd_hws_destroy_ns, + .set_peer = mlx5_cmd_hws_set_peer, + .get_capabilities = mlx5_cmd_hws_get_capabilities, +}; + +const struct mlx5_flow_cmds *mlx5_fs_cmd_get_hws_cmds(void) +{ + return &mlx5_flow_cmds_hws; +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws.h new file mode 100644 index 000000000000..cbddb72d4362 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws.h @@ -0,0 +1,80 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2025 NVIDIA Corporation & Affiliates */ + +#ifndef _MLX5_FS_HWS_ +#define _MLX5_FS_HWS_ + +#include "mlx5hws.h" +#include "fs_hws_pools.h" + +struct mlx5_fs_hws_actions_pool { + struct mlx5hws_action *tag_action; + struct mlx5hws_action *pop_vlan_action; + struct mlx5hws_action *push_vlan_action; + struct mlx5hws_action *drop_action; + struct mlx5hws_action *decapl2_action; + struct mlx5hws_action *remove_hdr_vlan_action; + struct mlx5_fs_pool insert_hdr_pool; + struct mlx5_fs_pool dl3tnltol2_pool; + struct xarray 
el2tol3tnl_pools; + struct xarray el2tol2tnl_pools; + struct xarray mh_pools; + struct xarray table_dests; + struct xarray vport_vhca_dests; + struct xarray vport_dests; +}; + +struct mlx5_fs_hws_context { + struct mlx5hws_context *hws_ctx; + struct mlx5_fs_hws_actions_pool hws_pool; +}; + +struct mlx5_fs_hws_table { + struct mlx5hws_table *hws_table; + bool miss_ft_set; +}; + +struct mlx5_fs_hws_action { + struct mlx5hws_action *hws_action; + struct mlx5_fs_pool *fs_pool; + struct mlx5_fs_hws_pr *pr_data; + struct mlx5_fs_hws_mh *mh_data; +}; + +struct mlx5_fs_hws_matcher { + struct mlx5hws_bwc_matcher *matcher; +}; + +struct mlx5_fs_hws_rule_action { + struct mlx5hws_action *action; + union { + struct mlx5_fc *counter; + }; +}; + +struct mlx5_fs_hws_rule { + struct mlx5hws_bwc_rule *bwc_rule; + struct mlx5_fs_hws_rule_action *hws_fs_actions; + int num_fs_actions; +}; + +#ifdef CONFIG_MLX5_HW_STEERING + +bool mlx5_fs_hws_is_supported(struct mlx5_core_dev *dev); + +const struct mlx5_flow_cmds *mlx5_fs_cmd_get_hws_cmds(void); + +#else + +static inline bool mlx5_fs_hws_is_supported(struct mlx5_core_dev *dev) +{ + return false; +} + +static inline const struct mlx5_flow_cmds *mlx5_fs_cmd_get_hws_cmds(void) +{ + return NULL; +} + +#endif /* CONFIG_MLX5_HWS_STEERING */ +#endif diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws_pools.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws_pools.c new file mode 100644 index 000000000000..2ae4ac62b0e2 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws_pools.c @@ -0,0 +1,450 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2025 NVIDIA Corporation & Affiliates */ + +#include <mlx5_core.h> +#include "fs_hws_pools.h" + +#define MLX5_FS_HWS_DEFAULT_BULK_LEN 65536 +#define MLX5_FS_HWS_POOL_MAX_THRESHOLD BIT(18) +#define MLX5_FS_HWS_POOL_USED_BUFF_RATIO 10 + +static struct mlx5hws_action * +mlx5_fs_dl3tnltol2_bulk_action_create(struct 
mlx5hws_context *ctx) +{ + struct mlx5hws_action_reformat_header reformat_hdr[2] = {}; + u32 flags = MLX5HWS_ACTION_FLAG_HWS_FDB; + enum mlx5hws_action_type reformat_type; + u32 log_bulk_size; + + reformat_type = MLX5HWS_ACTION_TYP_REFORMAT_TNL_L3_TO_L2; + reformat_hdr[MLX5_FS_DL3TNLTOL2_MAC_HDR_IDX].sz = ETH_HLEN; + reformat_hdr[MLX5_FS_DL3TNLTOL2_MAC_VLAN_HDR_IDX].sz = ETH_HLEN + VLAN_HLEN; + + log_bulk_size = ilog2(MLX5_FS_HWS_DEFAULT_BULK_LEN); + return mlx5hws_action_create_reformat(ctx, reformat_type, 2, + reformat_hdr, log_bulk_size, flags); +} + +static struct mlx5hws_action * +mlx5_fs_el2tol3tnl_bulk_action_create(struct mlx5hws_context *ctx, size_t data_size) +{ + struct mlx5hws_action_reformat_header reformat_hdr = {}; + u32 flags = MLX5HWS_ACTION_FLAG_HWS_FDB; + enum mlx5hws_action_type reformat_type; + u32 log_bulk_size; + + reformat_type = MLX5HWS_ACTION_TYP_REFORMAT_L2_TO_TNL_L3; + reformat_hdr.sz = data_size; + + log_bulk_size = ilog2(MLX5_FS_HWS_DEFAULT_BULK_LEN); + return mlx5hws_action_create_reformat(ctx, reformat_type, 1, + &reformat_hdr, log_bulk_size, flags); +} + +static struct mlx5hws_action * +mlx5_fs_el2tol2tnl_bulk_action_create(struct mlx5hws_context *ctx, size_t data_size) +{ + struct mlx5hws_action_reformat_header reformat_hdr = {}; + u32 flags = MLX5HWS_ACTION_FLAG_HWS_FDB; + enum mlx5hws_action_type reformat_type; + u32 log_bulk_size; + + reformat_type = MLX5HWS_ACTION_TYP_REFORMAT_L2_TO_TNL_L2; + reformat_hdr.sz = data_size; + + log_bulk_size = ilog2(MLX5_FS_HWS_DEFAULT_BULK_LEN); + return mlx5hws_action_create_reformat(ctx, reformat_type, 1, + &reformat_hdr, log_bulk_size, flags); +} + +static struct mlx5hws_action * +mlx5_fs_insert_hdr_bulk_action_create(struct mlx5hws_context *ctx) +{ + struct mlx5hws_action_insert_header insert_hdr = {}; + u32 flags = MLX5HWS_ACTION_FLAG_HWS_FDB; + u32 log_bulk_size; + + log_bulk_size = ilog2(MLX5_FS_HWS_DEFAULT_BULK_LEN); + insert_hdr.hdr.sz = MLX5_FS_INSERT_HDR_VLAN_SIZE; + insert_hdr.anchor 
= MLX5_FS_INSERT_HDR_VLAN_ANCHOR; + insert_hdr.offset = MLX5_FS_INSERT_HDR_VLAN_OFFSET; + + return mlx5hws_action_create_insert_header(ctx, 1, &insert_hdr, + log_bulk_size, flags); +} + +static struct mlx5hws_action * +mlx5_fs_pr_bulk_action_create(struct mlx5_core_dev *dev, + struct mlx5_fs_hws_pr_pool_ctx *pr_pool_ctx) +{ + struct mlx5_flow_root_namespace *root_ns; + struct mlx5hws_context *ctx; + size_t encap_data_size; + + root_ns = mlx5_get_root_namespace(dev, MLX5_FLOW_NAMESPACE_FDB); + if (!root_ns || root_ns->mode != MLX5_FLOW_STEERING_MODE_HMFS) + return NULL; + + ctx = root_ns->fs_hws_context.hws_ctx; + if (!ctx) + return NULL; + + encap_data_size = pr_pool_ctx->encap_data_size; + switch (pr_pool_ctx->reformat_type) { + case MLX5HWS_ACTION_TYP_REFORMAT_TNL_L3_TO_L2: + return mlx5_fs_dl3tnltol2_bulk_action_create(ctx); + case MLX5HWS_ACTION_TYP_REFORMAT_L2_TO_TNL_L3: + return mlx5_fs_el2tol3tnl_bulk_action_create(ctx, encap_data_size); + case MLX5HWS_ACTION_TYP_REFORMAT_L2_TO_TNL_L2: + return mlx5_fs_el2tol2tnl_bulk_action_create(ctx, encap_data_size); + case MLX5HWS_ACTION_TYP_INSERT_HEADER: + return mlx5_fs_insert_hdr_bulk_action_create(ctx); + default: + return NULL; + } + return NULL; +} + +static struct mlx5_fs_bulk * +mlx5_fs_hws_pr_bulk_create(struct mlx5_core_dev *dev, void *pool_ctx) +{ + struct mlx5_fs_hws_pr_pool_ctx *pr_pool_ctx; + struct mlx5_fs_hws_pr_bulk *pr_bulk; + int bulk_len; + int i; + + if (!pool_ctx) + return NULL; + pr_pool_ctx = pool_ctx; + bulk_len = MLX5_FS_HWS_DEFAULT_BULK_LEN; + pr_bulk = kvzalloc(struct_size(pr_bulk, prs_data, bulk_len), GFP_KERNEL); + if (!pr_bulk) + return NULL; + + if (mlx5_fs_bulk_init(dev, &pr_bulk->fs_bulk, bulk_len)) + goto free_pr_bulk; + + for (i = 0; i < bulk_len; i++) { + pr_bulk->prs_data[i].bulk = pr_bulk; + pr_bulk->prs_data[i].offset = i; + } + + pr_bulk->hws_action = mlx5_fs_pr_bulk_action_create(dev, pr_pool_ctx); + if (!pr_bulk->hws_action) + goto cleanup_fs_bulk; + + return 
&pr_bulk->fs_bulk; + +cleanup_fs_bulk: + mlx5_fs_bulk_cleanup(&pr_bulk->fs_bulk); +free_pr_bulk: + kvfree(pr_bulk); + return NULL; +} + +static int +mlx5_fs_hws_pr_bulk_destroy(struct mlx5_core_dev *dev, struct mlx5_fs_bulk *fs_bulk) +{ + struct mlx5_fs_hws_pr_bulk *pr_bulk; + + pr_bulk = container_of(fs_bulk, struct mlx5_fs_hws_pr_bulk, fs_bulk); + if (mlx5_fs_bulk_get_free_amount(fs_bulk) < fs_bulk->bulk_len) { + mlx5_core_err(dev, "Freeing bulk before all reformats were released\n"); + return -EBUSY; + } + + mlx5hws_action_destroy(pr_bulk->hws_action); + mlx5_fs_bulk_cleanup(fs_bulk); + kvfree(pr_bulk); + + return 0; +} + +static void mlx5_hws_pool_update_threshold(struct mlx5_fs_pool *hws_pool) +{ + hws_pool->threshold = min_t(int, MLX5_FS_HWS_POOL_MAX_THRESHOLD, + hws_pool->used_units / MLX5_FS_HWS_POOL_USED_BUFF_RATIO); +} + +static const struct mlx5_fs_pool_ops mlx5_fs_hws_pr_pool_ops = { + .bulk_create = mlx5_fs_hws_pr_bulk_create, + .bulk_destroy = mlx5_fs_hws_pr_bulk_destroy, + .update_threshold = mlx5_hws_pool_update_threshold, +}; + +int mlx5_fs_hws_pr_pool_init(struct mlx5_fs_pool *pr_pool, + struct mlx5_core_dev *dev, size_t encap_data_size, + enum mlx5hws_action_type reformat_type) +{ + struct mlx5_fs_hws_pr_pool_ctx *pr_pool_ctx; + + if (reformat_type != MLX5HWS_ACTION_TYP_INSERT_HEADER && + reformat_type != MLX5HWS_ACTION_TYP_REFORMAT_TNL_L3_TO_L2 && + reformat_type != MLX5HWS_ACTION_TYP_REFORMAT_L2_TO_TNL_L3 && + reformat_type != MLX5HWS_ACTION_TYP_REFORMAT_L2_TO_TNL_L2) + return -EOPNOTSUPP; + + pr_pool_ctx = kzalloc(sizeof(*pr_pool_ctx), GFP_KERNEL); + if (!pr_pool_ctx) + return -ENOMEM; + pr_pool_ctx->reformat_type = reformat_type; + pr_pool_ctx->encap_data_size = encap_data_size; + mlx5_fs_pool_init(pr_pool, dev, &mlx5_fs_hws_pr_pool_ops, pr_pool_ctx); + return 0; +} + +void mlx5_fs_hws_pr_pool_cleanup(struct mlx5_fs_pool *pr_pool) +{ + struct mlx5_fs_hws_pr_pool_ctx *pr_pool_ctx; + + mlx5_fs_pool_cleanup(pr_pool); + pr_pool_ctx = 
pr_pool->pool_ctx; + if (!pr_pool_ctx) + return; + kfree(pr_pool_ctx); +} + +struct mlx5_fs_hws_pr * +mlx5_fs_hws_pr_pool_acquire_pr(struct mlx5_fs_pool *pr_pool) +{ + struct mlx5_fs_pool_index pool_index = {}; + struct mlx5_fs_hws_pr_bulk *pr_bulk; + int err; + + err = mlx5_fs_pool_acquire_index(pr_pool, &pool_index); + if (err) + return ERR_PTR(err); + pr_bulk = container_of(pool_index.fs_bulk, struct mlx5_fs_hws_pr_bulk, + fs_bulk); + return &pr_bulk->prs_data[pool_index.index]; +} + +void mlx5_fs_hws_pr_pool_release_pr(struct mlx5_fs_pool *pr_pool, + struct mlx5_fs_hws_pr *pr_data) +{ + struct mlx5_fs_bulk *fs_bulk = &pr_data->bulk->fs_bulk; + struct mlx5_fs_pool_index pool_index = {}; + struct mlx5_core_dev *dev = pr_pool->dev; + + pool_index.fs_bulk = fs_bulk; + pool_index.index = pr_data->offset; + if (mlx5_fs_pool_release_index(pr_pool, &pool_index)) + mlx5_core_warn(dev, "Attempted to release packet reformat which is not acquired\n"); +} + +struct mlx5hws_action *mlx5_fs_hws_pr_get_action(struct mlx5_fs_hws_pr *pr_data) +{ + return pr_data->bulk->hws_action; +} + +static struct mlx5hws_action * +mlx5_fs_mh_bulk_action_create(struct mlx5hws_context *ctx, + struct mlx5hws_action_mh_pattern *pattern) +{ + u32 flags = MLX5HWS_ACTION_FLAG_HWS_FDB; + u32 log_bulk_size; + + log_bulk_size = ilog2(MLX5_FS_HWS_DEFAULT_BULK_LEN); + return mlx5hws_action_create_modify_header(ctx, 1, pattern, + log_bulk_size, flags); +} + +static struct mlx5_fs_bulk * +mlx5_fs_hws_mh_bulk_create(struct mlx5_core_dev *dev, void *pool_ctx) +{ + struct mlx5hws_action_mh_pattern *pattern; + struct mlx5_flow_root_namespace *root_ns; + struct mlx5_fs_hws_mh_bulk *mh_bulk; + struct mlx5hws_context *ctx; + int bulk_len; + + if (!pool_ctx) + return NULL; + + root_ns = mlx5_get_root_namespace(dev, MLX5_FLOW_NAMESPACE_FDB); + if (!root_ns || root_ns->mode != MLX5_FLOW_STEERING_MODE_HMFS) + return NULL; + + ctx = root_ns->fs_hws_context.hws_ctx; + if (!ctx) + return NULL; + + pattern = pool_ctx; + 
bulk_len = MLX5_FS_HWS_DEFAULT_BULK_LEN; + mh_bulk = kvzalloc(struct_size(mh_bulk, mhs_data, bulk_len), GFP_KERNEL); + if (!mh_bulk) + return NULL; + + if (mlx5_fs_bulk_init(dev, &mh_bulk->fs_bulk, bulk_len)) + goto free_mh_bulk; + + for (int i = 0; i < bulk_len; i++) { + mh_bulk->mhs_data[i].bulk = mh_bulk; + mh_bulk->mhs_data[i].offset = i; + } + + mh_bulk->hws_action = mlx5_fs_mh_bulk_action_create(ctx, pattern); + if (!mh_bulk->hws_action) + goto cleanup_fs_bulk; + + return &mh_bulk->fs_bulk; + +cleanup_fs_bulk: + mlx5_fs_bulk_cleanup(&mh_bulk->fs_bulk); +free_mh_bulk: + kvfree(mh_bulk); + return NULL; +} + +static int +mlx5_fs_hws_mh_bulk_destroy(struct mlx5_core_dev *dev, + struct mlx5_fs_bulk *fs_bulk) +{ + struct mlx5_fs_hws_mh_bulk *mh_bulk; + + mh_bulk = container_of(fs_bulk, struct mlx5_fs_hws_mh_bulk, fs_bulk); + if (mlx5_fs_bulk_get_free_amount(fs_bulk) < fs_bulk->bulk_len) { + mlx5_core_err(dev, "Freeing bulk before all modify header were released\n"); + return -EBUSY; + } + + mlx5hws_action_destroy(mh_bulk->hws_action); + mlx5_fs_bulk_cleanup(fs_bulk); + kvfree(mh_bulk); + + return 0; +} + +static const struct mlx5_fs_pool_ops mlx5_fs_hws_mh_pool_ops = { + .bulk_create = mlx5_fs_hws_mh_bulk_create, + .bulk_destroy = mlx5_fs_hws_mh_bulk_destroy, + .update_threshold = mlx5_hws_pool_update_threshold, +}; + +int mlx5_fs_hws_mh_pool_init(struct mlx5_fs_pool *fs_hws_mh_pool, + struct mlx5_core_dev *dev, + struct mlx5hws_action_mh_pattern *pattern) +{ + struct mlx5hws_action_mh_pattern *pool_pattern; + + pool_pattern = kzalloc(sizeof(*pool_pattern), GFP_KERNEL); + if (!pool_pattern) + return -ENOMEM; + pool_pattern->data = kmemdup(pattern->data, pattern->sz, GFP_KERNEL); + if (!pool_pattern->data) { + kfree(pool_pattern); + return -ENOMEM; + } + pool_pattern->sz = pattern->sz; + mlx5_fs_pool_init(fs_hws_mh_pool, dev, &mlx5_fs_hws_mh_pool_ops, + pool_pattern); + return 0; +} + +void mlx5_fs_hws_mh_pool_cleanup(struct mlx5_fs_pool *fs_hws_mh_pool) +{ + struct 
mlx5hws_action_mh_pattern *pool_pattern; + + mlx5_fs_pool_cleanup(fs_hws_mh_pool); + pool_pattern = fs_hws_mh_pool->pool_ctx; + if (!pool_pattern) + return; + kfree(pool_pattern->data); + kfree(pool_pattern); +} + +struct mlx5_fs_hws_mh * +mlx5_fs_hws_mh_pool_acquire_mh(struct mlx5_fs_pool *mh_pool) +{ + struct mlx5_fs_pool_index pool_index = {}; + struct mlx5_fs_hws_mh_bulk *mh_bulk; + int err; + + err = mlx5_fs_pool_acquire_index(mh_pool, &pool_index); + if (err) + return ERR_PTR(err); + mh_bulk = container_of(pool_index.fs_bulk, struct mlx5_fs_hws_mh_bulk, + fs_bulk); + return &mh_bulk->mhs_data[pool_index.index]; +} + +void mlx5_fs_hws_mh_pool_release_mh(struct mlx5_fs_pool *mh_pool, + struct mlx5_fs_hws_mh *mh_data) +{ + struct mlx5_fs_bulk *fs_bulk = &mh_data->bulk->fs_bulk; + struct mlx5_fs_pool_index pool_index = {}; + struct mlx5_core_dev *dev = mh_pool->dev; + + pool_index.fs_bulk = fs_bulk; + pool_index.index = mh_data->offset; + if (mlx5_fs_pool_release_index(mh_pool, &pool_index)) + mlx5_core_warn(dev, "Attempted to release modify header which is not acquired\n"); +} + +bool mlx5_fs_hws_mh_pool_match(struct mlx5_fs_pool *mh_pool, + struct mlx5hws_action_mh_pattern *pattern) +{ + struct mlx5hws_action_mh_pattern *pool_pattern; + int num_actions, i; + + pool_pattern = mh_pool->pool_ctx; + if (WARN_ON_ONCE(!pool_pattern)) + return false; + + if (pattern->sz != pool_pattern->sz) + return false; + num_actions = pattern->sz / MLX5_UN_SZ_BYTES(set_add_copy_action_in_auto); + for (i = 0; i < num_actions; i++) { + if ((__force __be32)pattern->data[i] != + (__force __be32)pool_pattern->data[i]) + return false; + } + return true; +} + +struct mlx5hws_action *mlx5_fc_get_hws_action(struct mlx5hws_context *ctx, + struct mlx5_fc *counter) +{ + u32 flags = MLX5HWS_ACTION_FLAG_HWS_FDB | MLX5HWS_ACTION_FLAG_SHARED; + struct mlx5_fc_bulk *fc_bulk = counter->bulk; + struct mlx5_fc_bulk_hws_data *fc_bulk_hws; + + fc_bulk_hws = &fc_bulk->hws_data; + /* try avoid locking if 
not necessary */ + if (refcount_inc_not_zero(&fc_bulk_hws->hws_action_refcount)) + return fc_bulk_hws->hws_action; + + mutex_lock(&fc_bulk_hws->lock); + if (refcount_inc_not_zero(&fc_bulk_hws->hws_action_refcount)) { + mutex_unlock(&fc_bulk_hws->lock); + return fc_bulk_hws->hws_action; + } + fc_bulk_hws->hws_action = + mlx5hws_action_create_counter(ctx, fc_bulk->base_id, flags); + if (!fc_bulk_hws->hws_action) { + mutex_unlock(&fc_bulk_hws->lock); + return NULL; + } + refcount_set(&fc_bulk_hws->hws_action_refcount, 1); + mutex_unlock(&fc_bulk_hws->lock); + + return fc_bulk_hws->hws_action; +} + +void mlx5_fc_put_hws_action(struct mlx5_fc *counter) +{ + struct mlx5_fc_bulk_hws_data *fc_bulk_hws = &counter->bulk->hws_data; + + /* try avoid locking if not necessary */ + if (refcount_dec_not_one(&fc_bulk_hws->hws_action_refcount)) + return; + + mutex_lock(&fc_bulk_hws->lock); + if (!refcount_dec_and_test(&fc_bulk_hws->hws_action_refcount)) { + mutex_unlock(&fc_bulk_hws->lock); + return; + } + mlx5hws_action_destroy(fc_bulk_hws->hws_action); + fc_bulk_hws->hws_action = NULL; + mutex_unlock(&fc_bulk_hws->lock); +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws_pools.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws_pools.h new file mode 100644 index 000000000000..34072551dd21 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws_pools.h @@ -0,0 +1,73 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2025 NVIDIA Corporation & Affiliates */ + +#ifndef __MLX5_FS_HWS_POOLS_H__ +#define __MLX5_FS_HWS_POOLS_H__ + +#include <linux/if_vlan.h> +#include "fs_pool.h" +#include "fs_core.h" + +#define MLX5_FS_INSERT_HDR_VLAN_ANCHOR MLX5_REFORMAT_CONTEXT_ANCHOR_MAC_START +#define MLX5_FS_INSERT_HDR_VLAN_OFFSET offsetof(struct vlan_ethhdr, h_vlan_proto) +#define MLX5_FS_INSERT_HDR_VLAN_SIZE sizeof(struct vlan_hdr) + +enum { + MLX5_FS_DL3TNLTOL2_MAC_HDR_IDX = 0, + 
MLX5_FS_DL3TNLTOL2_MAC_VLAN_HDR_IDX, +}; + +struct mlx5_fs_hws_pr { + struct mlx5_fs_hws_pr_bulk *bulk; + u32 offset; + u8 hdr_idx; + u8 *data; + size_t data_size; +}; + +struct mlx5_fs_hws_pr_bulk { + struct mlx5_fs_bulk fs_bulk; + struct mlx5hws_action *hws_action; + struct mlx5_fs_hws_pr prs_data[]; +}; + +struct mlx5_fs_hws_pr_pool_ctx { + enum mlx5hws_action_type reformat_type; + size_t encap_data_size; +}; + +struct mlx5_fs_hws_mh { + struct mlx5_fs_hws_mh_bulk *bulk; + u32 offset; + u8 *data; +}; + +struct mlx5_fs_hws_mh_bulk { + struct mlx5_fs_bulk fs_bulk; + struct mlx5_fs_pool *mh_pool; + struct mlx5hws_action *hws_action; + struct mlx5_fs_hws_mh mhs_data[]; +}; + +int mlx5_fs_hws_pr_pool_init(struct mlx5_fs_pool *pr_pool, + struct mlx5_core_dev *dev, size_t encap_data_size, + enum mlx5hws_action_type reformat_type); +void mlx5_fs_hws_pr_pool_cleanup(struct mlx5_fs_pool *pr_pool); + +struct mlx5_fs_hws_pr *mlx5_fs_hws_pr_pool_acquire_pr(struct mlx5_fs_pool *pr_pool); +void mlx5_fs_hws_pr_pool_release_pr(struct mlx5_fs_pool *pr_pool, + struct mlx5_fs_hws_pr *pr_data); +struct mlx5hws_action *mlx5_fs_hws_pr_get_action(struct mlx5_fs_hws_pr *pr_data); +int mlx5_fs_hws_mh_pool_init(struct mlx5_fs_pool *fs_hws_mh_pool, + struct mlx5_core_dev *dev, + struct mlx5hws_action_mh_pattern *pattern); +void mlx5_fs_hws_mh_pool_cleanup(struct mlx5_fs_pool *fs_hws_mh_pool); +struct mlx5_fs_hws_mh *mlx5_fs_hws_mh_pool_acquire_mh(struct mlx5_fs_pool *mh_pool); +void mlx5_fs_hws_mh_pool_release_mh(struct mlx5_fs_pool *mh_pool, + struct mlx5_fs_hws_mh *mh_data); +bool mlx5_fs_hws_mh_pool_match(struct mlx5_fs_pool *mh_pool, + struct mlx5hws_action_mh_pattern *pattern); +struct mlx5hws_action *mlx5_fc_get_hws_action(struct mlx5hws_context *ctx, + struct mlx5_fc *counter); +void mlx5_fc_put_hws_action(struct mlx5_fc *counter); +#endif /* __MLX5_FS_HWS_POOLS_H__ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/internal.h 
b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/internal.h index 3c8635f286ce..30ccd635b505 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/internal.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/internal.h @@ -39,7 +39,6 @@ #define mlx5hws_dbg(ctx, arg...) mlx5_core_dbg((ctx)->mdev, ##arg) #define MLX5HWS_TABLE_TYPE_BASE 2 -#define MLX5HWS_ACTION_STE_IDX_ANY 0 static inline bool is_mem_zero(const u8 *mem, size_t size) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c index 1bb3a6f8c3cd..80157a29a076 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c @@ -165,14 +165,14 @@ static int hws_matcher_disconnect(struct mlx5hws_matcher *matcher) next->match_ste.rtc_0_id, next->match_ste.rtc_1_id); if (ret) { - mlx5hws_err(tbl->ctx, "Failed to disconnect matcher\n"); - goto matcher_reconnect; + mlx5hws_err(tbl->ctx, "Fatal error, failed to disconnect matcher\n"); + return ret; } } else { ret = mlx5hws_table_connect_to_miss_table(tbl, tbl->default_miss.miss_tbl); if (ret) { - mlx5hws_err(tbl->ctx, "Failed to disconnect last matcher\n"); - goto matcher_reconnect; + mlx5hws_err(tbl->ctx, "Fatal error, failed to disconnect last matcher\n"); + return ret; } } @@ -180,27 +180,19 @@ static int hws_matcher_disconnect(struct mlx5hws_matcher *matcher) if (prev_ft_id == tbl->ft_id) { ret = mlx5hws_table_update_connected_miss_tables(tbl); if (ret) { - mlx5hws_err(tbl->ctx, "Fatal error, failed to update connected miss table\n"); - goto matcher_reconnect; + mlx5hws_err(tbl->ctx, + "Fatal error, failed to update connected miss table\n"); + return ret; } } ret = mlx5hws_table_ft_set_default_next_ft(tbl, prev_ft_id); if (ret) { mlx5hws_err(tbl->ctx, "Fatal error, failed to restore matcher ft default miss\n"); - goto matcher_reconnect; + return ret; } return 0; - 
-matcher_reconnect: - if (list_empty(&tbl->matchers_list) || !prev) - list_add(&matcher->list_node, &tbl->matchers_list); - else - /* insert after prev matcher */ - list_add(&matcher->list_node, &prev->list_node); - - return ret; } static void hws_matcher_set_rtc_attr_sz(struct mlx5hws_matcher *matcher, @@ -208,7 +200,7 @@ static void hws_matcher_set_rtc_attr_sz(struct mlx5hws_matcher *matcher, enum mlx5hws_matcher_rtc_type rtc_type, bool is_mirror) { - struct mlx5hws_pool_chunk *ste = &matcher->action_ste[MLX5HWS_ACTION_STE_IDX_ANY].ste; + struct mlx5hws_pool_chunk *ste = &matcher->action_ste.ste; enum mlx5hws_matcher_flow_src flow_src = matcher->attr.optimize_flow_src; bool is_match_rtc = rtc_type == HWS_MATCHER_RTC_TYPE_MATCH; @@ -225,8 +217,7 @@ static void hws_matcher_set_rtc_attr_sz(struct mlx5hws_matcher *matcher, } static int hws_matcher_create_rtc(struct mlx5hws_matcher *matcher, - enum mlx5hws_matcher_rtc_type rtc_type, - u8 action_ste_selector) + enum mlx5hws_matcher_rtc_type rtc_type) { struct mlx5hws_matcher_attr *attr = &matcher->attr; struct mlx5hws_cmd_rtc_create_attr rtc_attr = {0}; @@ -286,14 +277,19 @@ static int hws_matcher_create_rtc(struct mlx5hws_matcher *matcher, break; case HWS_MATCHER_RTC_TYPE_STE_ARRAY: - action_ste = &matcher->action_ste[action_ste_selector]; + action_ste = &matcher->action_ste; rtc_0_id = &action_ste->rtc_0_id; rtc_1_id = &action_ste->rtc_1_id; ste_pool = action_ste->pool; ste = &action_ste->ste; + /* Action RTC size calculation: + * log((max number of rules in matcher) * + * (max number of action STEs per rule) * + * (2 to support writing new STEs for update rule)) + */ ste->order = ilog2(roundup_pow_of_two(action_ste->max_stes)) + - attr->table.sz_row_log; + attr->table.sz_row_log + 1; rtc_attr.log_size = ste->order; rtc_attr.log_depth = 0; rtc_attr.update_index_mode = MLX5_IFC_RTC_STE_UPDATE_MODE_BY_OFFSET; @@ -318,8 +314,8 @@ static int hws_matcher_create_rtc(struct mlx5hws_matcher *matcher, 
hws_matcher_set_rtc_attr_sz(matcher, &rtc_attr, rtc_type, false); /* STC is a single resource (obj_id), use any STC for the ID */ - stc_pool = ctx->stc_pool[tbl->type]; - default_stc = ctx->common_res[tbl->type].default_stc; + stc_pool = ctx->stc_pool; + default_stc = ctx->common_res.default_stc; obj_id = mlx5hws_pool_chunk_get_base_id(stc_pool, &default_stc->default_hit); rtc_attr.stc_base = obj_id; @@ -358,8 +354,7 @@ free_ste: } static void hws_matcher_destroy_rtc(struct mlx5hws_matcher *matcher, - enum mlx5hws_matcher_rtc_type rtc_type, - u8 action_ste_selector) + enum mlx5hws_matcher_rtc_type rtc_type) { struct mlx5hws_matcher_action_ste *action_ste; struct mlx5hws_table *tbl = matcher->tbl; @@ -375,7 +370,7 @@ static void hws_matcher_destroy_rtc(struct mlx5hws_matcher *matcher, ste = &matcher->match_ste.ste; break; case HWS_MATCHER_RTC_TYPE_STE_ARRAY: - action_ste = &matcher->action_ste[action_ste_selector]; + action_ste = &matcher->action_ste; rtc_0_id = action_ste->rtc_0_id; rtc_1_id = action_ste->rtc_1_id; ste_pool = action_ste->pool; @@ -466,20 +461,13 @@ static int hws_matcher_resize_init(struct mlx5hws_matcher *src_matcher) if (!resize_data) return -ENOMEM; - resize_data->max_stes = src_matcher->action_ste[MLX5HWS_ACTION_STE_IDX_ANY].max_stes; - - resize_data->action_ste[0].stc = src_matcher->action_ste[0].stc; - resize_data->action_ste[0].rtc_0_id = src_matcher->action_ste[0].rtc_0_id; - resize_data->action_ste[0].rtc_1_id = src_matcher->action_ste[0].rtc_1_id; - resize_data->action_ste[0].pool = src_matcher->action_ste[0].max_stes ? - src_matcher->action_ste[0].pool : - NULL; - resize_data->action_ste[1].stc = src_matcher->action_ste[1].stc; - resize_data->action_ste[1].rtc_0_id = src_matcher->action_ste[1].rtc_0_id; - resize_data->action_ste[1].rtc_1_id = src_matcher->action_ste[1].rtc_1_id; - resize_data->action_ste[1].pool = src_matcher->action_ste[1].max_stes ? 
- src_matcher->action_ste[1].pool : - NULL; + resize_data->max_stes = src_matcher->action_ste.max_stes; + + resize_data->stc = src_matcher->action_ste.stc; + resize_data->rtc_0_id = src_matcher->action_ste.rtc_0_id; + resize_data->rtc_1_id = src_matcher->action_ste.rtc_1_id; + resize_data->pool = src_matcher->action_ste.max_stes ? + src_matcher->action_ste.pool : NULL; /* Place the new resized matcher on the dst matcher's list */ list_add(&resize_data->list_node, &src_matcher->resize_dst->resize_data); @@ -512,49 +500,68 @@ static void hws_matcher_resize_uninit(struct mlx5hws_matcher *matcher) if (resize_data->max_stes) { mlx5hws_action_free_single_stc(matcher->tbl->ctx, matcher->tbl->type, - &resize_data->action_ste[1].stc); - mlx5hws_action_free_single_stc(matcher->tbl->ctx, - matcher->tbl->type, - &resize_data->action_ste[0].stc); + &resize_data->stc); - if (matcher->tbl->type == MLX5HWS_TABLE_TYPE_FDB) { + if (matcher->tbl->type == MLX5HWS_TABLE_TYPE_FDB) mlx5hws_cmd_rtc_destroy(matcher->tbl->ctx->mdev, - resize_data->action_ste[1].rtc_1_id); - mlx5hws_cmd_rtc_destroy(matcher->tbl->ctx->mdev, - resize_data->action_ste[0].rtc_1_id); - } - mlx5hws_cmd_rtc_destroy(matcher->tbl->ctx->mdev, - resize_data->action_ste[1].rtc_0_id); + resize_data->rtc_1_id); + mlx5hws_cmd_rtc_destroy(matcher->tbl->ctx->mdev, - resize_data->action_ste[0].rtc_0_id); - if (resize_data->action_ste[MLX5HWS_ACTION_STE_IDX_ANY].pool) { - mlx5hws_pool_destroy(resize_data->action_ste[1].pool); - mlx5hws_pool_destroy(resize_data->action_ste[0].pool); - } + resize_data->rtc_0_id); + + if (resize_data->pool) + mlx5hws_pool_destroy(resize_data->pool); } kfree(resize_data); } } -static int -hws_matcher_bind_at_idx(struct mlx5hws_matcher *matcher, u8 action_ste_selector) +static int hws_matcher_bind_at(struct mlx5hws_matcher *matcher) { + bool is_jumbo = mlx5hws_matcher_mt_is_jumbo(matcher->mt); struct mlx5hws_cmd_stc_modify_attr stc_attr = {0}; struct mlx5hws_matcher_action_ste *action_ste; struct 
mlx5hws_table *tbl = matcher->tbl; struct mlx5hws_pool_attr pool_attr = {0}; struct mlx5hws_context *ctx = tbl->ctx; - int ret; + u32 required_stes; + u8 max_stes = 0; + int i, ret; + + if (matcher->flags & MLX5HWS_MATCHER_FLAGS_COLLISION) + return 0; + + for (i = 0; i < matcher->num_of_at; i++) { + struct mlx5hws_action_template *at = &matcher->at[i]; + + ret = hws_matcher_check_and_process_at(matcher, at); + if (ret) { + mlx5hws_err(ctx, "Invalid at %d", i); + return ret; + } + + required_stes = at->num_of_action_stes - (!is_jumbo || at->only_term); + max_stes = max(max_stes, required_stes); + + /* Future: Optimize reparse */ + } + + /* There are no additional STEs required for matcher */ + if (!max_stes) + return 0; + + matcher->action_ste.max_stes = max_stes; - action_ste = &matcher->action_ste[action_ste_selector]; + action_ste = &matcher->action_ste; /* Allocate action STE mempool */ pool_attr.table_type = tbl->type; pool_attr.pool_type = MLX5HWS_POOL_TYPE_STE; pool_attr.flags = MLX5HWS_POOL_FLAGS_FOR_STE_ACTION_POOL; + /* Pool size is similar to action RTC size */ pool_attr.alloc_log_sz = ilog2(roundup_pow_of_two(action_ste->max_stes)) + - matcher->attr.table.sz_row_log; + matcher->attr.table.sz_row_log + 1; hws_matcher_set_pool_attr(&pool_attr, matcher); action_ste->pool = mlx5hws_pool_create(ctx, &pool_attr); if (!action_ste->pool) { @@ -563,7 +570,7 @@ hws_matcher_bind_at_idx(struct mlx5hws_matcher *matcher, u8 action_ste_selector) } /* Allocate action RTC */ - ret = hws_matcher_create_rtc(matcher, HWS_MATCHER_RTC_TYPE_STE_ARRAY, action_ste_selector); + ret = hws_matcher_create_rtc(matcher, HWS_MATCHER_RTC_TYPE_STE_ARRAY); if (ret) { mlx5hws_err(ctx, "Failed to create action RTC\n"); goto free_ste_pool; @@ -587,18 +594,18 @@ hws_matcher_bind_at_idx(struct mlx5hws_matcher *matcher, u8 action_ste_selector) return 0; free_rtc: - hws_matcher_destroy_rtc(matcher, HWS_MATCHER_RTC_TYPE_STE_ARRAY, action_ste_selector); + hws_matcher_destroy_rtc(matcher, 
HWS_MATCHER_RTC_TYPE_STE_ARRAY); free_ste_pool: mlx5hws_pool_destroy(action_ste->pool); return ret; } -static void hws_matcher_unbind_at_idx(struct mlx5hws_matcher *matcher, u8 action_ste_selector) +static void hws_matcher_unbind_at(struct mlx5hws_matcher *matcher) { struct mlx5hws_matcher_action_ste *action_ste; struct mlx5hws_table *tbl = matcher->tbl; - action_ste = &matcher->action_ste[action_ste_selector]; + action_ste = &matcher->action_ste; if (!action_ste->max_stes || matcher->flags & MLX5HWS_MATCHER_FLAGS_COLLISION || @@ -606,65 +613,10 @@ static void hws_matcher_unbind_at_idx(struct mlx5hws_matcher *matcher, u8 action return; mlx5hws_action_free_single_stc(tbl->ctx, tbl->type, &action_ste->stc); - hws_matcher_destroy_rtc(matcher, HWS_MATCHER_RTC_TYPE_STE_ARRAY, action_ste_selector); + hws_matcher_destroy_rtc(matcher, HWS_MATCHER_RTC_TYPE_STE_ARRAY); mlx5hws_pool_destroy(action_ste->pool); } -static int hws_matcher_bind_at(struct mlx5hws_matcher *matcher) -{ - bool is_jumbo = mlx5hws_matcher_mt_is_jumbo(matcher->mt); - struct mlx5hws_table *tbl = matcher->tbl; - struct mlx5hws_context *ctx = tbl->ctx; - u32 required_stes; - u8 max_stes = 0; - int i, ret; - - if (matcher->flags & MLX5HWS_MATCHER_FLAGS_COLLISION) - return 0; - - for (i = 0; i < matcher->num_of_at; i++) { - struct mlx5hws_action_template *at = &matcher->at[i]; - - ret = hws_matcher_check_and_process_at(matcher, at); - if (ret) { - mlx5hws_err(ctx, "Invalid at %d", i); - return ret; - } - - required_stes = at->num_of_action_stes - (!is_jumbo || at->only_term); - max_stes = max(max_stes, required_stes); - - /* Future: Optimize reparse */ - } - - /* There are no additional STEs required for matcher */ - if (!max_stes) - return 0; - - matcher->action_ste[0].max_stes = max_stes; - matcher->action_ste[1].max_stes = max_stes; - - ret = hws_matcher_bind_at_idx(matcher, 0); - if (ret) - return ret; - - ret = hws_matcher_bind_at_idx(matcher, 1); - if (ret) - goto free_at_0; - - return 0; - -free_at_0: 
- hws_matcher_unbind_at_idx(matcher, 0); - return ret; -} - -static void hws_matcher_unbind_at(struct mlx5hws_matcher *matcher) -{ - hws_matcher_unbind_at_idx(matcher, 1); - hws_matcher_unbind_at_idx(matcher, 0); -} - static int hws_matcher_bind_mt(struct mlx5hws_matcher *matcher) { struct mlx5hws_context *ctx = matcher->tbl->ctx; @@ -810,7 +762,7 @@ static int hws_matcher_create_and_connect(struct mlx5hws_matcher *matcher) goto unbind_at; /* Allocate the RTC for the new matcher */ - ret = hws_matcher_create_rtc(matcher, HWS_MATCHER_RTC_TYPE_MATCH, 0); + ret = hws_matcher_create_rtc(matcher, HWS_MATCHER_RTC_TYPE_MATCH); if (ret) goto destroy_end_ft; @@ -822,7 +774,7 @@ static int hws_matcher_create_and_connect(struct mlx5hws_matcher *matcher) return 0; destroy_rtc: - hws_matcher_destroy_rtc(matcher, HWS_MATCHER_RTC_TYPE_MATCH, 0); + hws_matcher_destroy_rtc(matcher, HWS_MATCHER_RTC_TYPE_MATCH); destroy_end_ft: hws_matcher_destroy_end_ft(matcher); unbind_at: @@ -836,7 +788,7 @@ static void hws_matcher_destroy_and_disconnect(struct mlx5hws_matcher *matcher) { hws_matcher_resize_uninit(matcher); hws_matcher_disconnect(matcher); - hws_matcher_destroy_rtc(matcher, HWS_MATCHER_RTC_TYPE_MATCH, 0); + hws_matcher_destroy_rtc(matcher, HWS_MATCHER_RTC_TYPE_MATCH); hws_matcher_destroy_end_ft(matcher); hws_matcher_unbind_at(matcher); hws_matcher_unbind_mt(matcher); @@ -970,10 +922,9 @@ int mlx5hws_matcher_attach_at(struct mlx5hws_matcher *matcher, return ret; required_stes = at->num_of_action_stes - (!is_jumbo || at->only_term); - if (matcher->action_ste[MLX5HWS_ACTION_STE_IDX_ANY].max_stes < required_stes) { + if (matcher->action_ste.max_stes < required_stes) { mlx5hws_dbg(ctx, "Required STEs [%d] exceeds initial action template STE [%d]\n", - required_stes, - matcher->action_ste[MLX5HWS_ACTION_STE_IDX_ANY].max_stes); + required_stes, matcher->action_ste.max_stes); return -ENOMEM; } @@ -1007,9 +958,9 @@ hws_matcher_set_templates(struct mlx5hws_matcher *matcher, if 
(!matcher->mt) return -ENOMEM; - matcher->at = kcalloc(num_of_at + matcher->attr.max_num_of_at_attach, - sizeof(*matcher->at), - GFP_KERNEL); + matcher->at = kvcalloc(num_of_at + matcher->attr.max_num_of_at_attach, + sizeof(*matcher->at), + GFP_KERNEL); if (!matcher->at) { mlx5hws_err(ctx, "Failed to allocate action template array\n"); ret = -ENOMEM; @@ -1035,7 +986,7 @@ free_mt: static void hws_matcher_unset_templates(struct mlx5hws_matcher *matcher) { - kfree(matcher->at); + kvfree(matcher->at); kfree(matcher->mt); } @@ -1157,8 +1108,7 @@ static int hws_matcher_resize_precheck(struct mlx5hws_matcher *src_matcher, return -EINVAL; } - if (src_matcher->action_ste[MLX5HWS_ACTION_STE_IDX_ANY].max_stes > - dst_matcher->action_ste[0].max_stes) { + if (src_matcher->action_ste.max_stes > dst_matcher->action_ste.max_stes) { mlx5hws_err(ctx, "Src/dst matcher max STEs mismatch\n"); return -EINVAL; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.h index 81ff487f57be..cff4ae854a79 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.h @@ -52,15 +52,11 @@ struct mlx5hws_matcher_action_ste { u8 max_stes; }; -struct mlx5hws_matcher_resize_data_node { +struct mlx5hws_matcher_resize_data { struct mlx5hws_pool_chunk stc; u32 rtc_0_id; u32 rtc_1_id; struct mlx5hws_pool *pool; -}; - -struct mlx5hws_matcher_resize_data { - struct mlx5hws_matcher_resize_data_node action_ste[2]; u8 max_stes; struct list_head list_node; }; @@ -78,7 +74,7 @@ struct mlx5hws_matcher { struct mlx5hws_matcher *col_matcher; struct mlx5hws_matcher *resize_dst; struct mlx5hws_matcher_match_ste match_ste; - struct mlx5hws_matcher_action_ste action_ste[2]; + struct mlx5hws_matcher_action_ste action_ste; struct list_head list_node; struct list_head resize_data; }; diff --git 
a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pat_arg.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pat_arg.c index 06db5e4726ae..d9dc4f2d0dc6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pat_arg.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pat_arg.c @@ -344,7 +344,7 @@ void mlx5hws_arg_write(struct mlx5hws_send_engine *queue, mlx5hws_send_engine_post_req_wqe(&ctrl, (void *)&wqe_ctrl, &wqe_len); memset(wqe_ctrl, 0, wqe_len); mlx5hws_send_engine_post_req_wqe(&ctrl, (void *)&wqe_arg, &wqe_len); - memcpy(wqe_arg, arg_data, wqe_len); + memcpy(wqe_arg, arg_data, MLX5HWS_ARG_DATA_SIZE); send_attr.id = arg_idx++; mlx5hws_send_engine_post_end(&ctrl, &send_attr); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pat_arg.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pat_arg.h index 27ca93385b08..8ddb51980044 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pat_arg.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pat_arg.h @@ -31,7 +31,7 @@ struct mlx5hws_pattern_cache_item { u8 *data; u16 num_of_actions; } mh_data; - u32 refcount; + u32 refcount; /* protected by pattern_cache lock */ struct list_head ptrn_list_node; }; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.c index fed2d913f3b8..50a81d360bb2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.c @@ -183,7 +183,7 @@ static int hws_pool_buddy_get_mem_chunk(struct mlx5hws_pool *pool, *seg = -1; /* Find the next free place from the buddy array */ - while (*seg == -1) { + while (*seg < 0) { for (i = 0; i < MLX5HWS_POOL_RESOURCE_ARR_SZ; i++) { buddy = hws_pool_buddy_get_next_buddy(pool, i, order, @@ -194,7 +194,7 @@ static int hws_pool_buddy_get_mem_chunk(struct mlx5hws_pool *pool, } *seg = mlx5hws_buddy_alloc_mem(buddy, order); - if (*seg != 
-1) + if (*seg >= 0) goto found; if (pool->flags & MLX5HWS_POOL_FLAGS_ONE_RESOURCE) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/prm.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/prm.h index de92cecbeb92..271490a51b96 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/prm.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/prm.h @@ -390,11 +390,6 @@ struct mlx5_ifc_definer_bits { u8 match_mask[0x160]; }; -struct mlx5_ifc_arg_bits { - u8 rsvd0[0x88]; - u8 access_pd[0x18]; -}; - struct mlx5_ifc_header_modify_pattern_in_bits { u8 modify_field_select[0x40]; @@ -428,11 +423,6 @@ struct mlx5_ifc_create_definer_in_bits { struct mlx5_ifc_definer_bits definer; }; -struct mlx5_ifc_create_arg_in_bits { - struct mlx5_ifc_general_obj_in_cmd_hdr_bits hdr; - struct mlx5_ifc_arg_bits arg; -}; - struct mlx5_ifc_create_header_modify_pattern_in_bits { struct mlx5_ifc_general_obj_in_cmd_hdr_bits hdr; struct mlx5_ifc_header_modify_pattern_in_bits pattern; @@ -479,36 +469,4 @@ enum { MLX5_IFC_MODIFY_FLOW_TABLE_MISS_ACTION_GOTO_TBL = 1, }; -struct mlx5_ifc_alloc_packet_reformat_out_bits { - u8 status[0x8]; - u8 reserved_at_8[0x18]; - - u8 syndrome[0x20]; - - u8 packet_reformat_id[0x20]; - - u8 reserved_at_60[0x20]; -}; - -struct mlx5_ifc_dealloc_packet_reformat_in_bits { - u8 opcode[0x10]; - u8 reserved_at_10[0x10]; - - u8 reserved_at_20[0x10]; - u8 op_mod[0x10]; - - u8 packet_reformat_id[0x20]; - - u8 reserved_at_60[0x20]; -}; - -struct mlx5_ifc_dealloc_packet_reformat_out_bits { - u8 status[0x8]; - u8 reserved_at_8[0x18]; - - u8 syndrome[0x20]; - - u8 reserved_at_40[0x40]; -}; - #endif /* MLX5_PRM_H_ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/rule.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/rule.c index e20c67a04203..a27a2d5ffc7b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/rule.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/rule.c @@ -129,27 +129,18 @@ 
static void hws_rule_gen_comp(struct mlx5hws_send_engine *queue, static void hws_rule_save_resize_info(struct mlx5hws_rule *rule, - struct mlx5hws_send_ste_attr *ste_attr, - bool is_update) + struct mlx5hws_send_ste_attr *ste_attr) { if (!mlx5hws_matcher_is_resizable(rule->matcher)) return; - if (likely(!is_update)) { + /* resize_info might already exist (if we're in update flow) */ + if (likely(!rule->resize_info)) { rule->resize_info = kzalloc(sizeof(*rule->resize_info), GFP_KERNEL); if (unlikely(!rule->resize_info)) { pr_warn("HWS: resize info isn't allocated for rule\n"); return; } - - rule->resize_info->max_stes = - rule->matcher->action_ste[MLX5HWS_ACTION_STE_IDX_ANY].max_stes; - rule->resize_info->action_ste_pool[0] = rule->matcher->action_ste[0].max_stes ? - rule->matcher->action_ste[0].pool : - NULL; - rule->resize_info->action_ste_pool[1] = rule->matcher->action_ste[1].max_stes ? - rule->matcher->action_ste[1].pool : - NULL; } memcpy(rule->resize_info->ctrl_seg, ste_attr->wqe_ctrl, @@ -204,15 +195,14 @@ hws_rule_load_delete_info(struct mlx5hws_rule *rule, } } -static int hws_rule_alloc_action_ste_idx(struct mlx5hws_rule *rule, - u8 action_ste_selector) +static int hws_rule_alloc_action_ste(struct mlx5hws_rule *rule) { struct mlx5hws_matcher *matcher = rule->matcher; struct mlx5hws_matcher_action_ste *action_ste; struct mlx5hws_pool_chunk ste = {0}; int ret; - action_ste = &matcher->action_ste[action_ste_selector]; + action_ste = &matcher->action_ste; ste.order = ilog2(roundup_pow_of_two(action_ste->max_stes)); ret = mlx5hws_pool_chunk_alloc(action_ste->pool, &ste); if (unlikely(ret)) { @@ -220,68 +210,29 @@ static int hws_rule_alloc_action_ste_idx(struct mlx5hws_rule *rule, "Failed to allocate STE for rule actions"); return ret; } - rule->action_ste_idx = ste.offset; + + rule->action_ste.pool = matcher->action_ste.pool; + rule->action_ste.num_stes = matcher->action_ste.max_stes; + rule->action_ste.index = ste.offset; return 0; } -static void 
hws_rule_free_action_ste_idx(struct mlx5hws_rule *rule, - u8 action_ste_selector) +void mlx5hws_rule_free_action_ste(struct mlx5hws_rule_action_ste_info *action_ste) { - struct mlx5hws_matcher *matcher = rule->matcher; struct mlx5hws_pool_chunk ste = {0}; - struct mlx5hws_pool *pool; - u8 max_stes; - - if (mlx5hws_matcher_is_resizable(matcher)) { - /* Free the original action pool if rule was resized */ - max_stes = rule->resize_info->max_stes; - pool = rule->resize_info->action_ste_pool[action_ste_selector]; - } else { - max_stes = matcher->action_ste[action_ste_selector].max_stes; - pool = matcher->action_ste[action_ste_selector].pool; - } - - /* This release is safe only when the rule match part was deleted */ - ste.order = ilog2(roundup_pow_of_two(max_stes)); - ste.offset = rule->action_ste_idx; - - mlx5hws_pool_chunk_free(pool, &ste); -} -static int hws_rule_alloc_action_ste(struct mlx5hws_rule *rule, - struct mlx5hws_rule_attr *attr) -{ - int action_ste_idx; - int ret; - - ret = hws_rule_alloc_action_ste_idx(rule, 0); - if (unlikely(ret)) - return ret; - - action_ste_idx = rule->action_ste_idx; - - ret = hws_rule_alloc_action_ste_idx(rule, 1); - if (unlikely(ret)) { - hws_rule_free_action_ste_idx(rule, 0); - return ret; - } - - /* Both pools have to return the same index */ - if (unlikely(rule->action_ste_idx != action_ste_idx)) { - pr_warn("HWS: allocation of action STE failed - pool indexes mismatch\n"); - return -EINVAL; - } + if (!action_ste->num_stes) + return; - return 0; -} + ste.order = ilog2(roundup_pow_of_two(action_ste->num_stes)); + ste.offset = action_ste->index; -void mlx5hws_rule_free_action_ste(struct mlx5hws_rule *rule) -{ - if (rule->action_ste_idx > -1) { - hws_rule_free_action_ste_idx(rule, 1); - hws_rule_free_action_ste_idx(rule, 0); - } + /* This release is safe only when the rule match STE was deleted + * (when the rule is being deleted) or replaced with the new STE that + * isn't pointing to old action STEs (when the rule is being 
updated). + */ + mlx5hws_pool_chunk_free(action_ste->pool, &ste); } static void hws_rule_create_init(struct mlx5hws_rule *rule, @@ -298,14 +249,24 @@ static void hws_rule_create_init(struct mlx5hws_rule *rule, /* In update we use these rtc's */ rule->rtc_0 = 0; rule->rtc_1 = 0; - rule->action_ste_selector = 0; + + rule->action_ste.pool = NULL; + rule->action_ste.num_stes = 0; + rule->action_ste.index = -1; + + rule->status = MLX5HWS_RULE_STATUS_CREATING; } else { - rule->action_ste_selector = !rule->action_ste_selector; + rule->status = MLX5HWS_RULE_STATUS_UPDATING; } + /* Initialize the old action STE info - shallow-copy action_ste. + * In create flow this will set old_action_ste fields to initial values. + * In update flow this will save the existing action STE info, + * so that we will later use it to free old STEs. + */ + rule->old_action_ste = rule->action_ste; + rule->pending_wqes = 0; - rule->action_ste_idx = -1; - rule->status = MLX5HWS_RULE_STATUS_CREATING; /* Init default send STE attributes */ ste_attr->gta_opcode = MLX5HWS_WQE_GTA_OP_ACTIVATE; @@ -315,8 +276,8 @@ static void hws_rule_create_init(struct mlx5hws_rule *rule, /* Init default action apply */ apply->tbl_type = tbl->type; - apply->common_res = &ctx->common_res[tbl->type]; - apply->jump_to_action_stc = matcher->action_ste[0].stc.offset; + apply->common_res = &ctx->common_res; + apply->jump_to_action_stc = matcher->action_ste.stc.offset; apply->require_dep = 0; } @@ -332,8 +293,6 @@ static void hws_rule_move_init(struct mlx5hws_rule *rule, rule->rtc_1 = 0; rule->pending_wqes = 0; - rule->action_ste_idx = -1; - rule->action_ste_selector = 0; rule->status = MLX5HWS_RULE_STATUS_CREATING; rule->resize_info->state = MLX5HWS_RULE_RESIZE_STATE_WRITING; } @@ -394,21 +353,17 @@ static int hws_rule_create_hws(struct mlx5hws_rule *rule, if (action_stes) { /* Allocate action STEs for rules that need more than match STE */ - if (!is_update) { - ret = hws_rule_alloc_action_ste(rule, attr); - if (ret) { - 
mlx5hws_err(ctx, "Failed to allocate action memory %d", ret); - mlx5hws_send_abort_new_dep_wqe(queue); - return ret; - } + ret = hws_rule_alloc_action_ste(rule); + if (ret) { + mlx5hws_err(ctx, "Failed to allocate action memory %d", ret); + mlx5hws_send_abort_new_dep_wqe(queue); + return ret; } /* Skip RX/TX based on the dep_wqe init */ - ste_attr.rtc_0 = dep_wqe->rtc_0 ? - matcher->action_ste[rule->action_ste_selector].rtc_0_id : 0; - ste_attr.rtc_1 = dep_wqe->rtc_1 ? - matcher->action_ste[rule->action_ste_selector].rtc_1_id : 0; + ste_attr.rtc_0 = dep_wqe->rtc_0 ? matcher->action_ste.rtc_0_id : 0; + ste_attr.rtc_1 = dep_wqe->rtc_1 ? matcher->action_ste.rtc_1_id : 0; /* Action STEs are written to a specific index last to first */ - ste_attr.direct_index = rule->action_ste_idx + action_stes; + ste_attr.direct_index = rule->action_ste.index + action_stes; apply.next_direct_idx = ste_attr.direct_index; } else { apply.next_direct_idx = 0; @@ -459,7 +414,7 @@ static int hws_rule_create_hws(struct mlx5hws_rule *rule, if (!is_update) hws_rule_save_delete_info(rule, &ste_attr); - hws_rule_save_resize_info(rule, &ste_attr, is_update); + hws_rule_save_resize_info(rule, &ste_attr); mlx5hws_send_engine_inc_rule(queue); if (!attr->burst) @@ -480,7 +435,10 @@ static void hws_rule_destroy_failed_hws(struct mlx5hws_rule *rule, attr->user_data, MLX5HWS_RULE_STATUS_DELETED); /* Rule failed now we can safely release action STEs */ - mlx5hws_rule_free_action_ste(rule); + mlx5hws_rule_free_action_ste(&rule->action_ste); + + /* Perhaps the rule failed updating - release old action STEs as well */ + mlx5hws_rule_free_action_ste(&rule->old_action_ste); /* Clear complex tag */ hws_rule_clear_delete_info(rule); @@ -517,7 +475,8 @@ static int hws_rule_destroy_hws(struct mlx5hws_rule *rule, } /* Rule is not completed yet */ - if (rule->status == MLX5HWS_RULE_STATUS_CREATING) + if (rule->status == MLX5HWS_RULE_STATUS_CREATING || + rule->status == MLX5HWS_RULE_STATUS_UPDATING) return -EBUSY; 
/* Rule failed and doesn't require cleanup */ @@ -534,7 +493,7 @@ static int hws_rule_destroy_hws(struct mlx5hws_rule *rule, hws_rule_gen_comp(queue, rule, false, attr->user_data, MLX5HWS_RULE_STATUS_DELETED); - mlx5hws_rule_free_action_ste(rule); + mlx5hws_rule_free_action_ste(&rule->action_ste); mlx5hws_rule_clear_resize_info(rule); return 0; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/rule.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/rule.h index 495cdd17e9f3..b5ee94ac449b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/rule.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/rule.h @@ -15,6 +15,8 @@ enum mlx5hws_rule_status { MLX5HWS_RULE_STATUS_UNKNOWN, MLX5HWS_RULE_STATUS_CREATING, MLX5HWS_RULE_STATUS_CREATED, + MLX5HWS_RULE_STATUS_UPDATING, + MLX5HWS_RULE_STATUS_UPDATED, MLX5HWS_RULE_STATUS_DELETING, MLX5HWS_RULE_STATUS_DELETED, MLX5HWS_RULE_STATUS_FAILING, @@ -41,13 +43,17 @@ struct mlx5hws_rule_match_tag { }; }; +struct mlx5hws_rule_action_ste_info { + struct mlx5hws_pool *pool; + int index; /* STE array index */ + u8 num_stes; +}; + struct mlx5hws_rule_resize_info { - struct mlx5hws_pool *action_ste_pool[2]; u32 rtc_0; u32 rtc_1; u32 rule_idx; u8 state; - u8 max_stes; u8 ctrl_seg[MLX5HWS_WQE_SZ_GTA_CTRL]; /* Ctrl segment of STE: 48 bytes */ u8 data_seg[MLX5HWS_WQE_SZ_GTA_DATA]; /* Data segment of STE: 64 bytes */ }; @@ -58,18 +64,18 @@ struct mlx5hws_rule { struct mlx5hws_rule_match_tag tag; struct mlx5hws_rule_resize_info *resize_info; }; + struct mlx5hws_rule_action_ste_info action_ste; + struct mlx5hws_rule_action_ste_info old_action_ste; u32 rtc_0; /* The RTC into which the STE was inserted */ u32 rtc_1; /* The RTC into which the STE was inserted */ - int action_ste_idx; /* STE array index */ u8 status; /* enum mlx5hws_rule_status */ - u8 action_ste_selector; /* For rule update - which action STE is in use */ u8 pending_wqes; bool skip_delete; /* For complex rules - another rule with same 
tag * still exists, so don't actually delete this rule. */ }; -void mlx5hws_rule_free_action_ste(struct mlx5hws_rule *rule); +void mlx5hws_rule_free_action_ste(struct mlx5hws_rule_action_ste_info *action_ste); int mlx5hws_rule_move_hws_remove(struct mlx5hws_rule *rule, void *queue, void *user_data); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c index 20fe126ffd22..cb6abc4ab7df 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c @@ -377,17 +377,25 @@ static void hws_send_engine_update_rule(struct mlx5hws_send_engine *queue, *status = MLX5HWS_FLOW_OP_ERROR; } else { - /* Increase the status, this only works on good flow as the enum - * is arrange it away creating -> created -> deleting -> deleted + /* Increase the status, this only works on good flow as + * the enum is arranged this way: + * - creating -> created + * - updating -> updated + * - deleting -> deleted */ priv->rule->status++; *status = MLX5HWS_FLOW_OP_SUCCESS; - /* Rule was deleted now we can safely release action STEs - * and clear resize info - */ if (priv->rule->status == MLX5HWS_RULE_STATUS_DELETED) { - mlx5hws_rule_free_action_ste(priv->rule); + /* Rule was deleted, now we can safely release + * action STEs and clear resize info + */ + mlx5hws_rule_free_action_ste(&priv->rule->action_ste); mlx5hws_rule_clear_resize_info(priv->rule); + } else if (priv->rule->status == MLX5HWS_RULE_STATUS_UPDATED) { + /* Rule was updated, free the old action STEs */ + mlx5hws_rule_free_action_ste(&priv->rule->old_action_ste); + /* Update completed - move the rule back to "created" */ + priv->rule->status = MLX5HWS_RULE_STATUS_CREATED; } } } @@ -633,6 +641,7 @@ static int hws_send_ring_create_sq(struct mlx5_core_dev *mdev, u32 pdn, MLX5_SET(sqc, sqc, state, MLX5_SQC_STATE_RST); MLX5_SET(sqc, sqc, flush_in_error_en, 1); + MLX5_SET(sqc, sqc, 
non_wire, 1); ts_format = mlx5_is_real_time_sq(mdev) ? MLX5_TIMESTAMP_FORMAT_REAL_TIME : MLX5_TIMESTAMP_FORMAT_FREE_RUNNING; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/table.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/table.c index 9576e02d00c3..ab1297531232 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/table.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/table.c @@ -37,6 +37,7 @@ static void hws_table_set_cap_attr(struct mlx5hws_table *tbl, } static int hws_table_up_default_fdb_miss_tbl(struct mlx5hws_table *tbl) +__must_hold(&tbl->ctx->ctrl_lock) { struct mlx5hws_cmd_ft_create_attr ft_attr = {0}; struct mlx5hws_cmd_set_fte_attr fte_attr = {0}; @@ -48,8 +49,8 @@ static int hws_table_up_default_fdb_miss_tbl(struct mlx5hws_table *tbl) if (tbl->type != MLX5HWS_TABLE_TYPE_FDB) return 0; - if (ctx->common_res[tbl_type].default_miss) { - ctx->common_res[tbl_type].default_miss->refcount++; + if (ctx->common_res.default_miss) { + ctx->common_res.default_miss->refcount++; return 0; } @@ -70,29 +71,28 @@ static int hws_table_up_default_fdb_miss_tbl(struct mlx5hws_table *tbl) return -EINVAL; } - /* ctx->ctrl_lock must be held here */ - ctx->common_res[tbl_type].default_miss = default_miss; - ctx->common_res[tbl_type].default_miss->refcount++; + ctx->common_res.default_miss = default_miss; + ctx->common_res.default_miss->refcount++; return 0; } /* Called under ctx->ctrl_lock */ static void hws_table_down_default_fdb_miss_tbl(struct mlx5hws_table *tbl) +__must_hold(&tbl->ctx->ctrl_lock) { struct mlx5hws_cmd_forward_tbl *default_miss; struct mlx5hws_context *ctx = tbl->ctx; - u8 tbl_type = tbl->type; if (tbl->type != MLX5HWS_TABLE_TYPE_FDB) return; - default_miss = ctx->common_res[tbl_type].default_miss; + default_miss = ctx->common_res.default_miss; if (--default_miss->refcount) return; mlx5hws_cmd_forward_tbl_destroy(ctx->mdev, default_miss); - ctx->common_res[tbl_type].default_miss = NULL; + 
ctx->common_res.default_miss = NULL; } static int hws_table_connect_to_default_miss_tbl(struct mlx5hws_table *tbl, u32 ft_id) @@ -478,15 +478,9 @@ int mlx5hws_table_set_default_miss(struct mlx5hws_table *tbl, if (old_miss_tbl) list_del_init(&tbl->default_miss.next); - old_miss_tbl = tbl->default_miss.miss_tbl; - if (old_miss_tbl) - list_del_init(&old_miss_tbl->default_miss.head); - if (miss_tbl) list_add(&tbl->default_miss.next, &miss_tbl->default_miss.head); - mutex_unlock(&ctx->ctrl_lock); - return 0; out: mutex_unlock(&ctx->ctrl_lock); return ret; diff --git a/drivers/net/ethernet/meta/fbnic/Makefile b/drivers/net/ethernet/meta/fbnic/Makefile index 239b2258ec65..ea6214ca48e7 100644 --- a/drivers/net/ethernet/meta/fbnic/Makefile +++ b/drivers/net/ethernet/meta/fbnic/Makefile @@ -13,7 +13,6 @@ fbnic-y := fbnic_csr.o \ fbnic_ethtool.o \ fbnic_fw.o \ fbnic_hw_stats.o \ - fbnic_hwmon.o \ fbnic_irq.o \ fbnic_mac.o \ fbnic_netdev.o \ diff --git a/drivers/net/ethernet/meta/fbnic/fbnic.h b/drivers/net/ethernet/meta/fbnic/fbnic.h index 14751f16e125..50f97f5399ff 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic.h +++ b/drivers/net/ethernet/meta/fbnic/fbnic.h @@ -24,7 +24,6 @@ struct fbnic_dev { struct device *dev; struct net_device *netdev; struct dentry *dbg_fbd; - struct device *hwmon; u32 __iomem *uc_addr0; u32 __iomem *uc_addr4; @@ -42,7 +41,6 @@ struct fbnic_dev { struct fbnic_fw_mbx mbx[FBNIC_IPC_MBX_INDICES]; struct fbnic_fw_cap fw_cap; - struct fbnic_fw_completion *cmpl_data; /* Lock protecting Tx Mailbox queue to prevent possible races */ spinlock_t fw_tx_lock; @@ -151,9 +149,6 @@ void fbnic_devlink_unregister(struct fbnic_dev *fbd); int fbnic_fw_enable_mbx(struct fbnic_dev *fbd); void fbnic_fw_disable_mbx(struct fbnic_dev *fbd); -void fbnic_hwmon_register(struct fbnic_dev *fbd); -void fbnic_hwmon_unregister(struct fbnic_dev *fbd); - int fbnic_pcs_irq_enable(struct fbnic_dev *fbd); void fbnic_pcs_irq_disable(struct fbnic_dev *fbd); diff --git 
a/drivers/net/ethernet/meta/fbnic/fbnic_fw.h b/drivers/net/ethernet/meta/fbnic/fbnic_fw.h index 7cd8841920e4..221faf8c6756 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_fw.h +++ b/drivers/net/ethernet/meta/fbnic/fbnic_fw.h @@ -44,13 +44,6 @@ struct fbnic_fw_cap { u8 link_fec; }; -struct fbnic_fw_completion { - struct { - s32 millivolts; - s32 millidegrees; - } tsene; -}; - void fbnic_mbx_init(struct fbnic_dev *fbd); void fbnic_mbx_clean(struct fbnic_dev *fbd); void fbnic_mbx_poll(struct fbnic_dev *fbd); diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_hwmon.c b/drivers/net/ethernet/meta/fbnic/fbnic_hwmon.c deleted file mode 100644 index bcd1086e3768..000000000000 --- a/drivers/net/ethernet/meta/fbnic/fbnic_hwmon.c +++ /dev/null @@ -1,81 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* Copyright (c) Meta Platforms, Inc. and affiliates. */ - -#include <linux/hwmon.h> - -#include "fbnic.h" -#include "fbnic_mac.h" - -static int fbnic_hwmon_sensor_id(enum hwmon_sensor_types type) -{ - if (type == hwmon_temp) - return FBNIC_SENSOR_TEMP; - if (type == hwmon_in) - return FBNIC_SENSOR_VOLTAGE; - - return -EOPNOTSUPP; -} - -static umode_t fbnic_hwmon_is_visible(const void *drvdata, - enum hwmon_sensor_types type, - u32 attr, int channel) -{ - if (type == hwmon_temp && attr == hwmon_temp_input) - return 0444; - if (type == hwmon_in && attr == hwmon_in_input) - return 0444; - - return 0; -} - -static int fbnic_hwmon_read(struct device *dev, enum hwmon_sensor_types type, - u32 attr, int channel, long *val) -{ - struct fbnic_dev *fbd = dev_get_drvdata(dev); - const struct fbnic_mac *mac = fbd->mac; - int id; - - id = fbnic_hwmon_sensor_id(type); - return id < 0 ? 
id : mac->get_sensor(fbd, id, val); -} - -static const struct hwmon_ops fbnic_hwmon_ops = { - .is_visible = fbnic_hwmon_is_visible, - .read = fbnic_hwmon_read, -}; - -static const struct hwmon_channel_info *fbnic_hwmon_info[] = { - HWMON_CHANNEL_INFO(temp, HWMON_T_INPUT), - HWMON_CHANNEL_INFO(in, HWMON_I_INPUT), - NULL -}; - -static const struct hwmon_chip_info fbnic_chip_info = { - .ops = &fbnic_hwmon_ops, - .info = fbnic_hwmon_info, -}; - -void fbnic_hwmon_register(struct fbnic_dev *fbd) -{ - if (!IS_REACHABLE(CONFIG_HWMON)) - return; - - fbd->hwmon = hwmon_device_register_with_info(fbd->dev, "fbnic", - fbd, &fbnic_chip_info, - NULL); - if (IS_ERR(fbd->hwmon)) { - dev_notice(fbd->dev, - "Failed to register hwmon device %pe\n", - fbd->hwmon); - fbd->hwmon = NULL; - } -} - -void fbnic_hwmon_unregister(struct fbnic_dev *fbd) -{ - if (!IS_REACHABLE(CONFIG_HWMON) || !fbd->hwmon) - return; - - hwmon_device_unregister(fbd->hwmon); - fbd->hwmon = NULL; -} diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_mac.c b/drivers/net/ethernet/meta/fbnic/fbnic_mac.c index 80b82ff12c4d..7b654d0a6dac 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_mac.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_mac.c @@ -686,27 +686,6 @@ fbnic_mac_get_eth_mac_stats(struct fbnic_dev *fbd, bool reset, MAC_STAT_TX_BROADCAST); } -static int fbnic_mac_get_sensor_asic(struct fbnic_dev *fbd, int id, long *val) -{ - struct fbnic_fw_completion fw_cmpl; - s32 *sensor; - - switch (id) { - case FBNIC_SENSOR_TEMP: - sensor = &fw_cmpl.tsene.millidegrees; - break; - case FBNIC_SENSOR_VOLTAGE: - sensor = &fw_cmpl.tsene.millivolts; - break; - default: - return -EINVAL; - } - - *val = *sensor; - - return 0; -} - static const struct fbnic_mac fbnic_mac_asic = { .init_regs = fbnic_mac_init_regs, .pcs_enable = fbnic_pcs_enable_asic, @@ -716,7 +695,6 @@ static const struct fbnic_mac fbnic_mac_asic = { .get_eth_mac_stats = fbnic_mac_get_eth_mac_stats, .link_down = fbnic_mac_link_down_asic, .link_up = 
fbnic_mac_link_up_asic, - .get_sensor = fbnic_mac_get_sensor_asic, }; /** diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_mac.h b/drivers/net/ethernet/meta/fbnic/fbnic_mac.h index 05a591653e09..476239a9d381 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_mac.h +++ b/drivers/net/ethernet/meta/fbnic/fbnic_mac.h @@ -47,11 +47,6 @@ enum { #define FBNIC_LINK_MODE_PAM4 (FBNIC_LINK_50R1) #define FBNIC_LINK_MODE_MASK (FBNIC_LINK_AUTO - 1) -enum fbnic_sensor_id { - FBNIC_SENSOR_TEMP, /* Temp in millidegrees Centigrade */ - FBNIC_SENSOR_VOLTAGE, /* Voltage in millivolts */ -}; - /* This structure defines the interface hooks for the MAC. The MAC hooks * will be configured as a const struct provided with a set of function * pointers. @@ -88,8 +83,6 @@ struct fbnic_mac { void (*link_down)(struct fbnic_dev *fbd); void (*link_up)(struct fbnic_dev *fbd, bool tx_pause, bool rx_pause); - - int (*get_sensor)(struct fbnic_dev *fbd, int id, long *val); }; int fbnic_mac_init(struct fbnic_dev *fbd); diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_pci.c b/drivers/net/ethernet/meta/fbnic/fbnic_pci.c index 6cbbc2ee3e1f..2c96980d150d 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_pci.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_pci.c @@ -296,8 +296,6 @@ static int fbnic_probe(struct pci_dev *pdev, const struct pci_device_id *ent) /* Capture snapshot of hardware stats so netdev can calculate delta */ fbnic_reset_hw_stats(fbd); - fbnic_hwmon_register(fbd); - if (!fbd->dsn) { dev_warn(&pdev->dev, "Reading serial number failed\n"); goto init_failure_mode; @@ -360,7 +358,6 @@ static void fbnic_remove(struct pci_dev *pdev) fbnic_netdev_free(fbd); } - fbnic_hwmon_unregister(fbd); fbnic_dbg_fbd_exit(fbd); fbnic_devlink_unregister(fbd); fbnic_fw_disable_mbx(fbd); diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_txrx.c b/drivers/net/ethernet/meta/fbnic/fbnic_txrx.c index bb54ce5f5787..d4d7027df9a0 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_txrx.c +++ 
b/drivers/net/ethernet/meta/fbnic/fbnic_txrx.c @@ -1033,7 +1033,7 @@ static int fbnic_poll(struct napi_struct *napi, int budget) if (likely(napi_complete_done(napi, work_done))) fbnic_nv_irq_rearm(nv); - return 0; + return work_done; } irqreturn_t fbnic_msix_clean_rings(int __always_unused irq, void *data) diff --git a/drivers/net/ethernet/realtek/r8169.h b/drivers/net/ethernet/realtek/r8169.h index e0817f2a311a..7a194a8ab989 100644 --- a/drivers/net/ethernet/realtek/r8169.h +++ b/drivers/net/ethernet/realtek/r8169.h @@ -70,6 +70,7 @@ enum mac_version { RTL_GIGA_MAC_VER_63, RTL_GIGA_MAC_VER_64, RTL_GIGA_MAC_VER_65, + RTL_GIGA_MAC_VER_66, RTL_GIGA_MAC_VER_70, RTL_GIGA_MAC_VER_71, RTL_GIGA_MAC_NONE diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 5724f650f9c6..4b77f2151204 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -58,6 +58,7 @@ #define FIRMWARE_8125B_2 "rtl_nic/rtl8125b-2.fw" #define FIRMWARE_8125D_1 "rtl_nic/rtl8125d-1.fw" #define FIRMWARE_8125D_2 "rtl_nic/rtl8125d-2.fw" +#define FIRMWARE_8125BP_2 "rtl_nic/rtl8125bp-2.fw" #define FIRMWARE_8126A_2 "rtl_nic/rtl8126a-2.fw" #define FIRMWARE_8126A_3 "rtl_nic/rtl8126a-3.fw" @@ -142,6 +143,7 @@ static const struct { [RTL_GIGA_MAC_VER_63] = {"RTL8125B", FIRMWARE_8125B_2}, [RTL_GIGA_MAC_VER_64] = {"RTL8125D", FIRMWARE_8125D_1}, [RTL_GIGA_MAC_VER_65] = {"RTL8125D", FIRMWARE_8125D_2}, + [RTL_GIGA_MAC_VER_66] = {"RTL8125BP", FIRMWARE_8125BP_2}, [RTL_GIGA_MAC_VER_70] = {"RTL8126A", FIRMWARE_8126A_2}, [RTL_GIGA_MAC_VER_71] = {"RTL8126A", FIRMWARE_8126A_3}, }; @@ -632,6 +634,7 @@ enum rtl_dash_type { RTL_DASH_NONE, RTL_DASH_DP, RTL_DASH_EP, + RTL_DASH_25_BP, }; struct rtl8169_private { @@ -709,6 +712,7 @@ MODULE_FIRMWARE(FIRMWARE_8125A_3); MODULE_FIRMWARE(FIRMWARE_8125B_2); MODULE_FIRMWARE(FIRMWARE_8125D_1); MODULE_FIRMWARE(FIRMWARE_8125D_2); +MODULE_FIRMWARE(FIRMWARE_8125BP_2); 
MODULE_FIRMWARE(FIRMWARE_8126A_2); MODULE_FIRMWARE(FIRMWARE_8126A_3); @@ -1361,10 +1365,19 @@ static void rtl8168ep_driver_start(struct rtl8169_private *tp) rtl_loop_wait_high(tp, &rtl_ep_ocp_read_cond, 10000, 30); } +static void rtl8125bp_driver_start(struct rtl8169_private *tp) +{ + r8168ep_ocp_write(tp, 0x01, 0x14, OOB_CMD_DRIVER_START); + r8168ep_ocp_write(tp, 0x01, 0x18, 0x00); + r8168ep_ocp_write(tp, 0x01, 0x10, 0x01); +} + static void rtl8168_driver_start(struct rtl8169_private *tp) { if (tp->dash_type == RTL_DASH_DP) rtl8168dp_driver_start(tp); + else if (tp->dash_type == RTL_DASH_25_BP) + rtl8125bp_driver_start(tp); else rtl8168ep_driver_start(tp); } @@ -1385,10 +1398,19 @@ static void rtl8168ep_driver_stop(struct rtl8169_private *tp) rtl_loop_wait_low(tp, &rtl_ep_ocp_read_cond, 10000, 10); } +static void rtl8125bp_driver_stop(struct rtl8169_private *tp) +{ + r8168ep_ocp_write(tp, 0x01, 0x14, OOB_CMD_DRIVER_STOP); + r8168ep_ocp_write(tp, 0x01, 0x18, 0x00); + r8168ep_ocp_write(tp, 0x01, 0x10, 0x01); +} + static void rtl8168_driver_stop(struct rtl8169_private *tp) { if (tp->dash_type == RTL_DASH_DP) rtl8168dp_driver_stop(tp); + else if (tp->dash_type == RTL_DASH_25_BP) + rtl8125bp_driver_stop(tp); else rtl8168ep_driver_stop(tp); } @@ -1411,6 +1433,7 @@ static bool rtl_dash_is_enabled(struct rtl8169_private *tp) case RTL_DASH_DP: return r8168dp_check_dash(tp); case RTL_DASH_EP: + case RTL_DASH_25_BP: return r8168ep_check_dash(tp); default: return false; @@ -1425,6 +1448,8 @@ static enum rtl_dash_type rtl_get_dash_type(struct rtl8169_private *tp) return RTL_DASH_DP; case RTL_GIGA_MAC_VER_51 ... RTL_GIGA_MAC_VER_53: return RTL_DASH_EP; + case RTL_GIGA_MAC_VER_66: + return RTL_DASH_25_BP; default: return RTL_DASH_NONE; } @@ -2261,6 +2286,9 @@ static enum mac_version rtl8169_get_mac_version(u16 xid, bool gmii) { 0x7cf, 0x64a, RTL_GIGA_MAC_VER_71 }, { 0x7cf, 0x649, RTL_GIGA_MAC_VER_70 }, + /* 8125BP family. 
*/ + { 0x7cf, 0x681, RTL_GIGA_MAC_VER_66 }, + /* 8125D family. */ { 0x7cf, 0x689, RTL_GIGA_MAC_VER_65 }, { 0x7cf, 0x688, RTL_GIGA_MAC_VER_64 }, @@ -3842,6 +3870,7 @@ static void rtl_hw_config(struct rtl8169_private *tp) [RTL_GIGA_MAC_VER_63] = rtl_hw_start_8125b, [RTL_GIGA_MAC_VER_64] = rtl_hw_start_8125d, [RTL_GIGA_MAC_VER_65] = rtl_hw_start_8125d, + [RTL_GIGA_MAC_VER_66] = rtl_hw_start_8125d, [RTL_GIGA_MAC_VER_70] = rtl_hw_start_8126a, [RTL_GIGA_MAC_VER_71] = rtl_hw_start_8126a, }; @@ -3861,6 +3890,7 @@ static void rtl_hw_start_8125(struct rtl8169_private *tp) case RTL_GIGA_MAC_VER_61: case RTL_GIGA_MAC_VER_64: case RTL_GIGA_MAC_VER_65: + case RTL_GIGA_MAC_VER_66: for (i = 0xa00; i < 0xb00; i += 4) RTL_W32(tp, i, 0); break; diff --git a/drivers/net/ethernet/realtek/r8169_phy_config.c b/drivers/net/ethernet/realtek/r8169_phy_config.c index 968c8a2185a4..cf95e579c65d 100644 --- a/drivers/net/ethernet/realtek/r8169_phy_config.c +++ b/drivers/net/ethernet/realtek/r8169_phy_config.c @@ -1102,6 +1102,28 @@ static void rtl8125d_hw_phy_config(struct rtl8169_private *tp, rtl8125_config_eee_phy(phydev); } +static void rtl8125bp_hw_phy_config(struct rtl8169_private *tp, + struct phy_device *phydev) +{ + r8169_apply_firmware(tp); + rtl8168g_enable_gphy_10m(phydev); + + r8168g_phy_param(phydev, 0x8010, 0x0800, 0x0000); + + phy_write(phydev, 0x1f, 0x0b87); + phy_write(phydev, 0x16, 0x8088); + phy_modify(phydev, 0x17, 0xff00, 0x9000); + phy_write(phydev, 0x16, 0x808f); + phy_modify(phydev, 0x17, 0xff00, 0x9000); + phy_write(phydev, 0x1f, 0x0000); + + r8168g_phy_param(phydev, 0x8174, 0x2000, 0x1800); + + rtl8125_legacy_force_mode(phydev); + rtl8168g_disable_aldps(phydev); + rtl8125_config_eee_phy(phydev); +} + static void rtl8126a_hw_phy_config(struct rtl8169_private *tp, struct phy_device *phydev) { @@ -1163,6 +1185,7 @@ void r8169_hw_phy_config(struct rtl8169_private *tp, struct phy_device *phydev, [RTL_GIGA_MAC_VER_63] = rtl8125b_hw_phy_config, [RTL_GIGA_MAC_VER_64] = 
rtl8125d_hw_phy_config, [RTL_GIGA_MAC_VER_65] = rtl8125d_hw_phy_config, + [RTL_GIGA_MAC_VER_66] = rtl8125bp_hw_phy_config, [RTL_GIGA_MAC_VER_70] = rtl8126a_hw_phy_config, [RTL_GIGA_MAC_VER_71] = rtl8126a_hw_phy_config, }; diff --git a/drivers/net/ethernet/realtek/rtase/rtase_main.c b/drivers/net/ethernet/realtek/rtase/rtase_main.c index 585d0b21c9e0..3bd11cb56294 100644 --- a/drivers/net/ethernet/realtek/rtase/rtase_main.c +++ b/drivers/net/ethernet/realtek/rtase/rtase_main.c @@ -1828,7 +1828,7 @@ static int rtase_alloc_msix(struct pci_dev *pdev, struct rtase_private *tp) for (i = 0; i < tp->int_nums; i++) { irq = pci_irq_vector(pdev, i); - if (!irq) { + if (irq < 0) { pci_disable_msix(pdev); return irq; } diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-sti.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-sti.c index eabc4da9e1a9..de9b6dfef15b 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-sti.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-sti.c @@ -313,7 +313,6 @@ static void sti_dwmac_remove(struct platform_device *pdev) clk_disable_unprepare(dwmac->clk); } -#ifdef CONFIG_PM_SLEEP static int sti_dwmac_suspend(struct device *dev) { struct sti_dwmac *dwmac = get_stmmac_bsp_priv(dev); @@ -333,10 +332,9 @@ static int sti_dwmac_resume(struct device *dev) return stmmac_resume(dev); } -#endif /* CONFIG_PM_SLEEP */ -static SIMPLE_DEV_PM_OPS(sti_dwmac_pm_ops, sti_dwmac_suspend, - sti_dwmac_resume); +static DEFINE_SIMPLE_DEV_PM_OPS(sti_dwmac_pm_ops, sti_dwmac_suspend, + sti_dwmac_resume); static const struct sti_dwmac_of_data stih4xx_dwmac_data = { .fix_retime_src = stih4xx_fix_retime_src, @@ -353,7 +351,7 @@ static struct platform_driver sti_dwmac_driver = { .remove = sti_dwmac_remove, .driver = { .name = "sti-dwmac", - .pm = &sti_dwmac_pm_ops, + .pm = pm_sleep_ptr(&sti_dwmac_pm_ops), .of_match_table = sti_dwmac_match, }, }; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-tegra.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-tegra.c index 
3827997d2132..dc903b846b1b 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-tegra.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-tegra.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only +#include <linux/iommu.h> #include <linux/platform_device.h> #include <linux/of.h> #include <linux/module.h> @@ -19,6 +20,8 @@ struct tegra_mgbe { struct reset_control *rst_mac; struct reset_control *rst_pcs; + u32 iommu_sid; + void __iomem *hv; void __iomem *regs; void __iomem *xpcs; @@ -50,7 +53,6 @@ struct tegra_mgbe { #define MGBE_WRAP_COMMON_INTR_ENABLE 0x8704 #define MAC_SBD_INTR BIT(2) #define MGBE_WRAP_AXI_ASID0_CTRL 0x8400 -#define MGBE_SID 0x6 static int __maybe_unused tegra_mgbe_suspend(struct device *dev) { @@ -84,7 +86,7 @@ static int __maybe_unused tegra_mgbe_resume(struct device *dev) writel(MAC_SBD_INTR, mgbe->regs + MGBE_WRAP_COMMON_INTR_ENABLE); /* Program SID */ - writel(MGBE_SID, mgbe->hv + MGBE_WRAP_AXI_ASID0_CTRL); + writel(mgbe->iommu_sid, mgbe->hv + MGBE_WRAP_AXI_ASID0_CTRL); value = readl(mgbe->xpcs + XPCS_WRAP_UPHY_STATUS); if ((value & XPCS_WRAP_UPHY_STATUS_TX_P_UP) == 0) { @@ -241,6 +243,12 @@ static int tegra_mgbe_probe(struct platform_device *pdev) if (IS_ERR(mgbe->xpcs)) return PTR_ERR(mgbe->xpcs); + /* get controller's stream id from iommu property in device tree */ + if (!tegra_dev_iommu_get_stream_id(mgbe->dev, &mgbe->iommu_sid)) { + dev_err(mgbe->dev, "failed to get iommu stream id\n"); + return -EINVAL; + } + res.addr = mgbe->regs; res.irq = irq; @@ -346,7 +354,7 @@ static int tegra_mgbe_probe(struct platform_device *pdev) writel(MAC_SBD_INTR, mgbe->regs + MGBE_WRAP_COMMON_INTR_ENABLE); /* Program SID */ - writel(MGBE_SID, mgbe->hv + MGBE_WRAP_AXI_ASID0_CTRL); + writel(mgbe->iommu_sid, mgbe->hv + MGBE_WRAP_AXI_ASID0_CTRL); plat->flags |= STMMAC_FLAG_SERDES_UP_AFTER_PHY_LINKUP; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c index c36f90a782c5..9ed8620580a8 100644 
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c @@ -420,10 +420,10 @@ static void dwmac4_set_eee_pls(struct mac_device_info *hw, int link) writel(value, ioaddr + GMAC4_LPI_CTRL_STATUS); } -static void dwmac4_set_eee_lpi_entry_timer(struct mac_device_info *hw, int et) +static void dwmac4_set_eee_lpi_entry_timer(struct mac_device_info *hw, u32 et) { void __iomem *ioaddr = hw->pcsr; - int value = et & STMMAC_ET_MAX; + u32 value = et & STMMAC_ET_MAX; int regval; /* Program LPI entry timer value into register */ diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.h b/drivers/net/ethernet/stmicro/stmmac/hwif.h index 2f7295b6c1c5..0f200b72c225 100644 --- a/drivers/net/ethernet/stmicro/stmmac/hwif.h +++ b/drivers/net/ethernet/stmicro/stmmac/hwif.h @@ -363,7 +363,7 @@ struct stmmac_ops { void (*set_eee_mode)(struct mac_device_info *hw, bool en_tx_lpi_clockgating); void (*reset_eee_mode)(struct mac_device_info *hw); - void (*set_eee_lpi_entry_timer)(struct mac_device_info *hw, int et); + void (*set_eee_lpi_entry_timer)(struct mac_device_info *hw, u32 et); void (*set_eee_timer)(struct mac_device_info *hw, int ls, int tw); void (*set_eee_pls)(struct mac_device_info *hw, int link); void (*debug)(struct stmmac_priv *priv, void __iomem *ioaddr, diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac.h b/drivers/net/ethernet/stmicro/stmmac/stmmac.h index b8d631e559c0..e8dbce20129c 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac.h +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac.h @@ -305,11 +305,9 @@ struct stmmac_priv { int clk_csr; struct timer_list eee_ctrl_timer; int lpi_irq; - int eee_enabled; - int eee_active; - int tx_lpi_timer; - int tx_lpi_enabled; - int eee_tw_timer; + u32 tx_lpi_timer; + bool eee_enabled; + bool eee_active; bool eee_sw_timer_en; unsigned int mode; unsigned int chain_mode; @@ -405,8 +403,6 @@ void stmmac_dvr_remove(struct device *dev); int stmmac_dvr_probe(struct device *device, 
struct plat_stmmacenet_data *plat_dat, struct stmmac_resources *res); -void stmmac_disable_eee_mode(struct stmmac_priv *priv); -bool stmmac_eee_init(struct stmmac_priv *priv); int stmmac_reinit_queues(struct net_device *dev, u32 rx_cnt, u32 tx_cnt); int stmmac_reinit_ringparam(struct net_device *dev, u32 rx_size, u32 tx_size); int stmmac_bus_clks_config(struct stmmac_priv *priv, bool enabled); @@ -416,14 +412,6 @@ static inline bool stmmac_xdp_is_enabled(struct stmmac_priv *priv) return !!priv->xdp_prog; } -static inline unsigned int stmmac_rx_offset(struct stmmac_priv *priv) -{ - if (stmmac_xdp_is_enabled(priv)) - return XDP_PACKET_HEADROOM; - - return 0; -} - void stmmac_disable_rx_queue(struct stmmac_priv *priv, u32 queue); void stmmac_enable_rx_queue(struct stmmac_priv *priv, u32 queue); void stmmac_disable_tx_queue(struct stmmac_priv *priv, u32 queue); diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c index 16b4d8c21c90..918a32f8fda8 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c @@ -654,7 +654,7 @@ static void stmmac_get_ethtool_stats(struct net_device *dev, (*(u32 *)p); } } - if (priv->eee_enabled) { + if (priv->dma_cap.eee) { int val = phylink_get_eee_err(priv->phylink); if (val) priv->xstats.phy_eee_wakeup_error_n = val; @@ -898,9 +898,6 @@ static int stmmac_ethtool_op_get_eee(struct net_device *dev, if (!priv->dma_cap.eee) return -EOPNOTSUPP; - edata->tx_lpi_timer = priv->tx_lpi_timer; - edata->tx_lpi_enabled = priv->tx_lpi_enabled; - return phylink_ethtool_get_eee(priv->phylink, edata); } @@ -908,29 +905,11 @@ static int stmmac_ethtool_op_set_eee(struct net_device *dev, struct ethtool_keee *edata) { struct stmmac_priv *priv = netdev_priv(dev); - int ret; if (!priv->dma_cap.eee) return -EOPNOTSUPP; - if (priv->tx_lpi_enabled != edata->tx_lpi_enabled) - netdev_warn(priv->dev, - "Setting EEE tx-lpi is not 
supported\n"); - - if (!edata->eee_enabled) - stmmac_disable_eee_mode(priv); - - ret = phylink_ethtool_set_eee(priv->phylink, edata); - if (ret) - return ret; - - if (edata->eee_enabled && - priv->tx_lpi_timer != edata->tx_lpi_timer) { - priv->tx_lpi_timer = edata->tx_lpi_timer; - stmmac_eee_init(priv); - } - - return 0; + return phylink_ethtool_set_eee(priv->phylink, edata); } static u32 stmmac_usec2riwt(u32 usec, struct stmmac_priv *priv) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 99eaec8bac4a..58b013528dea 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -111,8 +111,8 @@ static const u32 default_msg_level = (NETIF_MSG_DRV | NETIF_MSG_PROBE | NETIF_MSG_IFDOWN | NETIF_MSG_TIMER); #define STMMAC_DEFAULT_LPI_TIMER 1000 -static int eee_timer = STMMAC_DEFAULT_LPI_TIMER; -module_param(eee_timer, int, 0644); +static unsigned int eee_timer = STMMAC_DEFAULT_LPI_TIMER; +module_param(eee_timer, uint, 0644); MODULE_PARM_DESC(eee_timer, "LPI tx expiration time in msec"); #define STMMAC_LPI_T(x) (jiffies + usecs_to_jiffies(x)) @@ -194,8 +194,6 @@ static void stmmac_verify_args(void) flow_ctrl = FLOW_OFF; if (unlikely((pause < 0) || (pause > 0xffff))) pause = PAUSE_TIME; - if (eee_timer < 0) - eee_timer = STMMAC_DEFAULT_LPI_TIMER; } static void __stmmac_disable_all_queues(struct stmmac_priv *priv) @@ -392,14 +390,14 @@ static inline u32 stmmac_rx_dirty(struct stmmac_priv *priv, u32 queue) return dirty; } -static void stmmac_lpi_entry_timer_config(struct stmmac_priv *priv, bool en) +static void stmmac_disable_hw_lpi_timer(struct stmmac_priv *priv) { - int tx_lpi_timer; + stmmac_set_eee_lpi_timer(priv, priv->hw, 0); +} - /* Clear/set the SW EEE timer flag based on LPI ET enablement */ - priv->eee_sw_timer_en = en ? 0 : 1; - tx_lpi_timer = en ? 
priv->tx_lpi_timer : 0; - stmmac_set_eee_lpi_timer(priv, priv->hw, tx_lpi_timer); +static void stmmac_enable_hw_lpi_timer(struct stmmac_priv *priv) +{ + stmmac_set_eee_lpi_timer(priv, priv->hw, priv->tx_lpi_timer); } /** @@ -429,18 +427,13 @@ static int stmmac_enable_eee_mode(struct stmmac_priv *priv) } /** - * stmmac_disable_eee_mode - disable and exit from LPI mode + * stmmac_disable_sw_eee_mode - disable and exit from LPI mode * @priv: driver private structure * Description: this function is to exit and disable EEE in case of * LPI state is true. This is called by the xmit. */ -void stmmac_disable_eee_mode(struct stmmac_priv *priv) +static void stmmac_disable_sw_eee_mode(struct stmmac_priv *priv) { - if (!priv->eee_sw_timer_en) { - stmmac_lpi_entry_timer_config(priv, 0); - return; - } - stmmac_reset_eee_mode(priv, priv->hw); del_timer_sync(&priv->eee_ctrl_timer); priv->tx_path_in_lpi_mode = false; @@ -464,18 +457,21 @@ static void stmmac_eee_ctrl_timer(struct timer_list *t) /** * stmmac_eee_init - init EEE * @priv: driver private structure + * @active: indicates whether EEE should be enabled. * Description: * if the GMAC supports the EEE (from the HW cap reg) and the phy device * can also manage EEE, this function enable the LPI state and start related * timer. */ -bool stmmac_eee_init(struct stmmac_priv *priv) +static void stmmac_eee_init(struct stmmac_priv *priv, bool active) { - int eee_tw_timer = priv->eee_tw_timer; + priv->eee_active = active; /* Check if MAC core supports the EEE feature. 
*/ - if (!priv->dma_cap.eee) - return false; + if (!priv->dma_cap.eee) { + priv->eee_enabled = false; + return; + } mutex_lock(&priv->lock); @@ -483,22 +479,24 @@ bool stmmac_eee_init(struct stmmac_priv *priv) if (!priv->eee_active) { if (priv->eee_enabled) { netdev_dbg(priv->dev, "disable EEE\n"); - stmmac_lpi_entry_timer_config(priv, 0); + priv->eee_sw_timer_en = true; + stmmac_disable_hw_lpi_timer(priv); del_timer_sync(&priv->eee_ctrl_timer); - stmmac_set_eee_timer(priv, priv->hw, 0, eee_tw_timer); + stmmac_set_eee_timer(priv, priv->hw, 0, + STMMAC_DEFAULT_TWT_LS); if (priv->hw->xpcs) xpcs_config_eee(priv->hw->xpcs, priv->plat->mult_fact_100ns, false); } + priv->eee_enabled = false; mutex_unlock(&priv->lock); - return false; + return; } if (priv->eee_active && !priv->eee_enabled) { - timer_setup(&priv->eee_ctrl_timer, stmmac_eee_ctrl_timer, 0); stmmac_set_eee_timer(priv, priv->hw, STMMAC_DEFAULT_LIT_LS, - eee_tw_timer); + STMMAC_DEFAULT_TWT_LS); if (priv->hw->xpcs) xpcs_config_eee(priv->hw->xpcs, priv->plat->mult_fact_100ns, @@ -506,18 +504,23 @@ bool stmmac_eee_init(struct stmmac_priv *priv) } if (priv->plat->has_gmac4 && priv->tx_lpi_timer <= STMMAC_ET_MAX) { + /* Use hardware LPI mode */ del_timer_sync(&priv->eee_ctrl_timer); priv->tx_path_in_lpi_mode = false; - stmmac_lpi_entry_timer_config(priv, 1); + priv->eee_sw_timer_en = false; + stmmac_enable_hw_lpi_timer(priv); } else { - stmmac_lpi_entry_timer_config(priv, 0); + /* Use software LPI mode */ + priv->eee_sw_timer_en = true; + stmmac_disable_hw_lpi_timer(priv); mod_timer(&priv->eee_ctrl_timer, STMMAC_LPI_T(priv->tx_lpi_timer)); } + priv->eee_enabled = true; + mutex_unlock(&priv->lock); netdev_dbg(priv->dev, "Energy-Efficient Ethernet initialized\n"); - return true; } /* stmmac_get_tx_hwtstamp - get HW TX timestamps @@ -974,9 +977,7 @@ static void stmmac_mac_link_down(struct phylink_config *config, struct stmmac_priv *priv = netdev_priv(to_net_dev(config->dev)); stmmac_mac_set(priv, priv->ioaddr, false); 
- priv->eee_active = false; - priv->tx_lpi_enabled = false; - priv->eee_enabled = stmmac_eee_init(priv); + stmmac_eee_init(priv, false); stmmac_set_eee_pls(priv, priv->hw, false); if (stmmac_fpe_supported(priv)) @@ -1085,11 +1086,10 @@ static void stmmac_mac_link_up(struct phylink_config *config, stmmac_mac_set(priv, priv->ioaddr, true); if (phy && priv->dma_cap.eee) { - priv->eee_active = - phy_init_eee(phy, !(priv->plat->flags & - STMMAC_FLAG_RX_CLK_RUNS_IN_LPI)) >= 0; - priv->eee_enabled = stmmac_eee_init(priv); - priv->tx_lpi_enabled = priv->eee_enabled; + phy_eee_rx_clock_stop(phy, !(priv->plat->flags & + STMMAC_FLAG_RX_CLK_RUNS_IN_LPI)); + priv->tx_lpi_timer = phy->eee_cfg.tx_lpi_timer; + stmmac_eee_init(priv, phy->enable_tx_lpi); stmmac_set_eee_pls(priv, priv->hw, true); } @@ -1187,6 +1187,16 @@ static int stmmac_init_phy(struct net_device *dev) ret = phylink_fwnode_phy_connect(priv->phylink, fwnode, 0); } + if (ret == 0) { + struct ethtool_keee eee; + + /* Configure phylib's copy of the LPI timer */ + if (!phylink_ethtool_get_eee(priv->phylink, &eee)) { + eee.tx_lpi_timer = priv->tx_lpi_timer; + phylink_ethtool_set_eee(priv->phylink, &eee); + } + } + if (!priv->plat->pmt) { struct ethtool_wolinfo wol = { .cmd = ETHTOOL_GWOL }; @@ -1203,6 +1213,7 @@ static int stmmac_phy_setup(struct stmmac_priv *priv) struct stmmac_mdio_bus_data *mdio_bus_data; int mode = priv->plat->phy_interface; struct fwnode_handle *fwnode; + struct phylink_pcs *pcs; struct phylink *phylink; priv->phylink_config.dev = &priv->dev->dev; @@ -1224,8 +1235,14 @@ static int stmmac_phy_setup(struct stmmac_priv *priv) /* If we have an xpcs, it defines which PHY interfaces are supported. 
*/ if (priv->hw->xpcs) - xpcs_get_interfaces(priv->hw->xpcs, - priv->phylink_config.supported_interfaces); + pcs = xpcs_to_phylink_pcs(priv->hw->xpcs); + else + pcs = priv->hw->phylink_pcs; + + if (pcs) + phy_interface_or(priv->phylink_config.supported_interfaces, + priv->phylink_config.supported_interfaces, + pcs->supported_interfaces); fwnode = priv->plat->port_node; if (!fwnode) @@ -1308,6 +1325,14 @@ static void stmmac_display_rings(struct stmmac_priv *priv, stmmac_display_tx_rings(priv, dma_conf); } +static unsigned int stmmac_rx_offset(struct stmmac_priv *priv) +{ + if (stmmac_xdp_is_enabled(priv)) + return XDP_PACKET_HEADROOM; + + return 0; +} + static int stmmac_set_bfsize(int mtu, int bufsize) { int ret = bufsize; @@ -3437,12 +3462,6 @@ static int stmmac_hw_setup(struct net_device *dev, bool ptp_register) else if (ptp_register) stmmac_ptp_register(priv); - priv->eee_tw_timer = STMMAC_DEFAULT_TWT_LS; - - /* Convert the timer from msec to usec */ - if (!priv->tx_lpi_timer) - priv->tx_lpi_timer = eee_timer * 1000; - if (priv->use_riwt) { u32 queue; @@ -3909,6 +3928,10 @@ static int __stmmac_open(struct net_device *dev, u32 chan; int ret; + /* Initialise the tx lpi timer, converting from msec to usec */ + if (!priv->tx_lpi_timer) + priv->tx_lpi_timer = eee_timer * 1000; + ret = pm_runtime_resume_and_get(priv->device); if (ret < 0) return ret; @@ -4023,11 +4046,6 @@ static int stmmac_release(struct net_device *dev) /* Free the IRQ lines */ stmmac_free_irq(dev, REQ_IRQ_ERR_ALL, 0); - if (priv->eee_enabled) { - priv->tx_path_in_lpi_mode = false; - del_timer_sync(&priv->eee_ctrl_timer); - } - /* Stop TX/RX DMA and clear the descriptors */ stmmac_stop_all_dma(priv); @@ -4479,7 +4497,7 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev) first_tx = tx_q->cur_tx; if (priv->tx_path_in_lpi_mode && priv->eee_sw_timer_en) - stmmac_disable_eee_mode(priv); + stmmac_disable_sw_eee_mode(priv); /* Manage oversized TCP frames for GMAC4 device */ if 
(skb_is_gso(skb) && priv->tso) { @@ -5460,7 +5478,7 @@ read_again: if (priv->extend_desc) stmmac_rx_extended_status(priv, &priv->xstats, rx_q->dma_erx + entry); if (unlikely(status == discard_frame)) { - page_pool_recycle_direct(rx_q->page_pool, buf->page); + page_pool_put_page(rx_q->page_pool, buf->page, 0, true); buf->page = NULL; error = 1; if (!priv->hwts_rx_en) @@ -7394,6 +7412,8 @@ int stmmac_dvr_probe(struct device *device, INIT_WORK(&priv->service_task, stmmac_service_task); + timer_setup(&priv->eee_ctrl_timer, stmmac_eee_ctrl_timer, 0); + /* Override with kernel parameters if supplied XXX CRS XXX * this needs to have multiple instances */ diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c index 5465bf872734..dcb6662b473d 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c @@ -762,7 +762,7 @@ static int am65_cpsw_nuss_common_open(struct am65_cpsw_common *common) ALE_DEFAULT_THREAD_ID, 0); cpsw_ale_control_set(common->ale, HOST_PORT_NUM, ALE_DEFAULT_THREAD_ENABLE, 1); - /* switch to vlan unaware mode */ + /* switch to vlan aware mode */ cpsw_ale_control_set(common->ale, HOST_PORT_NUM, ALE_VLAN_AWARE, 1); cpsw_ale_control_set(common->ale, HOST_PORT_NUM, ALE_PORT_STATE, ALE_PORT_STATE_FORWARD); diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c index 1e290ee8edfd..0cb6fa6e5b7d 100644 --- a/drivers/net/ethernet/ti/cpsw.c +++ b/drivers/net/ethernet/ti/cpsw.c @@ -686,7 +686,7 @@ static void cpsw_init_host_port(struct cpsw_priv *priv) soft_reset("cpsw", &cpsw->regs->soft_reset); cpsw_ale_start(cpsw->ale); - /* switch to vlan unaware mode */ + /* switch to vlan aware mode */ cpsw_ale_control_set(cpsw->ale, HOST_PORT_NUM, ALE_VLAN_AWARE, CPSW_ALE_VLAN_AWARE); control_reg = readl(&cpsw->regs->control); diff --git a/drivers/net/ethernet/ti/cpsw_new.c b/drivers/net/ethernet/ti/cpsw_new.c index be4d90c1cbe7..cec0a90659d9 100644 --- 
a/drivers/net/ethernet/ti/cpsw_new.c +++ b/drivers/net/ethernet/ti/cpsw_new.c @@ -554,7 +554,7 @@ static void cpsw_init_host_port(struct cpsw_priv *priv) soft_reset("cpsw", &cpsw->regs->soft_reset); cpsw_ale_start(cpsw->ale); - /* switch to vlan unaware mode */ + /* switch to vlan aware mode */ cpsw_ale_control_set(cpsw->ale, HOST_PORT_NUM, ALE_VLAN_AWARE, CPSW_ALE_VLAN_AWARE); control_reg = readl(&cpsw->regs->control); diff --git a/drivers/net/ethernet/wangxun/libwx/wx_hw.c b/drivers/net/ethernet/wangxun/libwx/wx_hw.c index 1bf9c38e4125..deaf670c160e 100644 --- a/drivers/net/ethernet/wangxun/libwx/wx_hw.c +++ b/drivers/net/ethernet/wangxun/libwx/wx_hw.c @@ -334,27 +334,25 @@ int wx_host_interface_command(struct wx *wx, u32 *buffer, status = read_poll_timeout(rd32, hicr, hicr & WX_MNG_MBOX_CTL_FWRDY, 1000, timeout * 1000, false, wx, WX_MNG_MBOX_CTL); + buf[0] = rd32(wx, WX_MNG_MBOX); + if ((buf[0] & 0xff0000) >> 16 == 0x80) { + wx_err(wx, "Unknown FW command: 0x%x\n", buffer[0] & 0xff); + status = -EINVAL; + goto rel_out; + } + /* Check command completion */ if (status) { - wx_dbg(wx, "Command has failed with no status valid.\n"); - - buf[0] = rd32(wx, WX_MNG_MBOX); - if ((buffer[0] & 0xff) != (~buf[0] >> 24)) { - status = -EINVAL; - goto rel_out; - } - if ((buf[0] & 0xff0000) >> 16 == 0x80) { - wx_dbg(wx, "It's unknown cmd.\n"); - status = -EINVAL; - goto rel_out; - } - + wx_err(wx, "Command has failed with no status valid.\n"); wx_dbg(wx, "write value:\n"); for (i = 0; i < dword_len; i++) wx_dbg(wx, "%x ", buffer[i]); wx_dbg(wx, "read value:\n"); for (i = 0; i < dword_len; i++) wx_dbg(wx, "%x ", buf[i]); + wx_dbg(wx, "\ncheck: %x %x\n", buffer[0] & 0xff, ~buf[0] >> 24); + + goto rel_out; } if (!return_data) diff --git a/drivers/net/ieee802154/ca8210.c b/drivers/net/ieee802154/ca8210.c index e685a7f946f0..753215ebc67c 100644 --- a/drivers/net/ieee802154/ca8210.c +++ b/drivers/net/ieee802154/ca8210.c @@ -3072,7 +3072,11 @@ static int ca8210_probe(struct spi_device 
*spi_device) spi_set_drvdata(priv->spi, priv); if (IS_ENABLED(CONFIG_IEEE802154_CA8210_DEBUGFS)) { cascoda_api_upstream = ca8210_test_int_driver_write; - ca8210_test_interface_init(priv); + ret = ca8210_test_interface_init(priv); + if (ret) { + dev_crit(&spi_device->dev, "ca8210_test_interface_init failed\n"); + goto error; + } } else { cascoda_api_upstream = NULL; } diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c index ee2c3cf4df36..da3a97a65507 100644 --- a/drivers/net/ipvlan/ipvlan_main.c +++ b/drivers/net/ipvlan/ipvlan_main.c @@ -799,6 +799,12 @@ static int ipvlan_device_event(struct notifier_block *unused, case NETDEV_PRE_TYPE_CHANGE: /* Forbid underlying device to change its type. */ return NOTIFY_BAD; + + case NETDEV_NOTIFY_PEERS: + case NETDEV_BONDING_FAILOVER: + case NETDEV_RESEND_IGMP: + list_for_each_entry(ipvlan, &port->ipvlans, pnode) + call_netdevice_notifiers(event, ipvlan->dev); } return NOTIFY_DONE; } diff --git a/drivers/net/mctp/mctp-i3c.c b/drivers/net/mctp/mctp-i3c.c index 9adad59b8676..d247fe483c58 100644 --- a/drivers/net/mctp/mctp-i3c.c +++ b/drivers/net/mctp/mctp-i3c.c @@ -125,6 +125,8 @@ static int mctp_i3c_read(struct mctp_i3c_device *mi) xfer.data.in = skb_put(skb, mi->mrl); + /* Make sure netif_rx() is read in the same order as i3c. 
*/ + mutex_lock(&mi->lock); rc = i3c_device_do_priv_xfers(mi->i3c, &xfer, 1); if (rc < 0) goto err; @@ -166,8 +168,10 @@ static int mctp_i3c_read(struct mctp_i3c_device *mi) stats->rx_dropped++; } + mutex_unlock(&mi->lock); return 0; err: + mutex_unlock(&mi->lock); kfree_skb(skb); return rc; } diff --git a/drivers/net/netconsole.c b/drivers/net/netconsole.c index f422a2f666ef..86ab4a42769a 100644 --- a/drivers/net/netconsole.c +++ b/drivers/net/netconsole.c @@ -730,7 +730,7 @@ static void update_userdata(struct netconsole_target *nt) struct userdatum *udm_item; struct config_item *item; - if (child_count >= MAX_USERDATA_ITEMS) + if (WARN_ON_ONCE(child_count >= MAX_USERDATA_ITEMS)) break; child_count++; diff --git a/drivers/net/netdevsim/netdev.c b/drivers/net/netdevsim/netdev.c index e068a9761c09..d013b6498539 100644 --- a/drivers/net/netdevsim/netdev.c +++ b/drivers/net/netdevsim/netdev.c @@ -20,6 +20,7 @@ #include <linux/netdevice.h> #include <linux/slab.h> #include <net/netdev_queues.h> +#include <net/netdev_rx_queue.h> #include <net/page_pool/helpers.h> #include <net/netlink.h> #include <net/net_shaper.h> @@ -29,6 +30,8 @@ #include "netdevsim.h" +MODULE_IMPORT_NS("NETDEV_INTERNAL"); + #define NSIM_RING_SIZE 256 static int nsim_napi_rx(struct nsim_rq *rq, struct sk_buff *skb) @@ -69,7 +72,7 @@ static netdev_tx_t nsim_start_xmit(struct sk_buff *skb, struct net_device *dev) rxq = skb_get_queue_mapping(skb); if (rxq >= peer_dev->num_rx_queues) rxq = rxq % peer_dev->num_rx_queues; - rq = &peer_ns->rq[rxq]; + rq = peer_ns->rq[rxq]; skb_tx_timestamp(skb); if (unlikely(nsim_forward_skb(peer_dev, skb, rq) == NET_RX_DROP)) @@ -359,25 +362,24 @@ static int nsim_poll(struct napi_struct *napi, int budget) return done; } -static int nsim_create_page_pool(struct nsim_rq *rq) +static int nsim_create_page_pool(struct page_pool **p, struct napi_struct *napi) { - struct page_pool_params p = { + struct page_pool_params params = { .order = 0, .pool_size = NSIM_RING_SIZE, .nid = 
NUMA_NO_NODE, - .dev = &rq->napi.dev->dev, - .napi = &rq->napi, + .dev = &napi->dev->dev, + .napi = napi, .dma_dir = DMA_BIDIRECTIONAL, - .netdev = rq->napi.dev, + .netdev = napi->dev, }; + struct page_pool *pool; - rq->page_pool = page_pool_create(&p); - if (IS_ERR(rq->page_pool)) { - int err = PTR_ERR(rq->page_pool); + pool = page_pool_create(&params); + if (IS_ERR(pool)) + return PTR_ERR(pool); - rq->page_pool = NULL; - return err; - } + *p = pool; return 0; } @@ -388,15 +390,15 @@ static int nsim_init_napi(struct netdevsim *ns) int err, i; for (i = 0; i < dev->num_rx_queues; i++) { - rq = &ns->rq[i]; + rq = ns->rq[i]; - netif_napi_add(dev, &rq->napi, nsim_poll); + netif_napi_add_config(dev, &rq->napi, nsim_poll, i); } for (i = 0; i < dev->num_rx_queues; i++) { - rq = &ns->rq[i]; + rq = ns->rq[i]; - err = nsim_create_page_pool(rq); + err = nsim_create_page_pool(&rq->page_pool, &rq->napi); if (err) goto err_pp_destroy; } @@ -405,12 +407,12 @@ static int nsim_init_napi(struct netdevsim *ns) err_pp_destroy: while (i--) { - page_pool_destroy(ns->rq[i].page_pool); - ns->rq[i].page_pool = NULL; + page_pool_destroy(ns->rq[i]->page_pool); + ns->rq[i]->page_pool = NULL; } for (i = 0; i < dev->num_rx_queues; i++) - __netif_napi_del(&ns->rq[i].napi); + __netif_napi_del(&ns->rq[i]->napi); return err; } @@ -421,7 +423,7 @@ static void nsim_enable_napi(struct netdevsim *ns) int i; for (i = 0; i < dev->num_rx_queues; i++) { - struct nsim_rq *rq = &ns->rq[i]; + struct nsim_rq *rq = ns->rq[i]; netif_queue_set_napi(dev, i, NETDEV_QUEUE_TYPE_RX, &rq->napi); napi_enable(&rq->napi); @@ -448,7 +450,7 @@ static void nsim_del_napi(struct netdevsim *ns) int i; for (i = 0; i < dev->num_rx_queues; i++) { - struct nsim_rq *rq = &ns->rq[i]; + struct nsim_rq *rq = ns->rq[i]; napi_disable(&rq->napi); __netif_napi_del(&rq->napi); @@ -456,8 +458,8 @@ static void nsim_del_napi(struct netdevsim *ns) synchronize_net(); for (i = 0; i < dev->num_rx_queues; i++) { - 
page_pool_destroy(ns->rq[i].page_pool); - ns->rq[i].page_pool = NULL; + page_pool_destroy(ns->rq[i]->page_pool); + ns->rq[i]->page_pool = NULL; } } @@ -595,6 +597,182 @@ static const struct netdev_stat_ops nsim_stat_ops = { .get_base_stats = nsim_get_base_stats, }; +static struct nsim_rq *nsim_queue_alloc(void) +{ + struct nsim_rq *rq; + + rq = kzalloc(sizeof(*rq), GFP_KERNEL_ACCOUNT); + if (!rq) + return NULL; + + skb_queue_head_init(&rq->skb_queue); + return rq; +} + +static void nsim_queue_free(struct nsim_rq *rq) +{ + skb_queue_purge_reason(&rq->skb_queue, SKB_DROP_REASON_QUEUE_PURGE); + kfree(rq); +} + +/* Queue reset mode is controlled by ns->rq_reset_mode. + * - normal - new NAPI new pool (old NAPI enabled when new added) + * - mode 1 - allocate new pool (NAPI is only disabled / enabled) + * - mode 2 - new NAPI new pool (old NAPI removed before new added) + * - mode 3 - new NAPI new pool (old NAPI disabled when new added) + */ +struct nsim_queue_mem { + struct nsim_rq *rq; + struct page_pool *pp; +}; + +static int +nsim_queue_mem_alloc(struct net_device *dev, void *per_queue_mem, int idx) +{ + struct nsim_queue_mem *qmem = per_queue_mem; + struct netdevsim *ns = netdev_priv(dev); + int err; + + if (ns->rq_reset_mode > 3) + return -EINVAL; + + if (ns->rq_reset_mode == 1) + return nsim_create_page_pool(&qmem->pp, &ns->rq[idx]->napi); + + qmem->rq = nsim_queue_alloc(); + if (!qmem->rq) + return -ENOMEM; + + err = nsim_create_page_pool(&qmem->rq->page_pool, &qmem->rq->napi); + if (err) + goto err_free; + + if (!ns->rq_reset_mode) + netif_napi_add_config(dev, &qmem->rq->napi, nsim_poll, idx); + + return 0; + +err_free: + nsim_queue_free(qmem->rq); + return err; +} + +static void nsim_queue_mem_free(struct net_device *dev, void *per_queue_mem) +{ + struct nsim_queue_mem *qmem = per_queue_mem; + struct netdevsim *ns = netdev_priv(dev); + + page_pool_destroy(qmem->pp); + if (qmem->rq) { + if (!ns->rq_reset_mode) + netif_napi_del(&qmem->rq->napi); + 
page_pool_destroy(qmem->rq->page_pool); + nsim_queue_free(qmem->rq); + } +} + +static int +nsim_queue_start(struct net_device *dev, void *per_queue_mem, int idx) +{ + struct nsim_queue_mem *qmem = per_queue_mem; + struct netdevsim *ns = netdev_priv(dev); + + if (ns->rq_reset_mode == 1) { + ns->rq[idx]->page_pool = qmem->pp; + napi_enable(&ns->rq[idx]->napi); + return 0; + } + + /* netif_napi_add()/_del() should normally be called from alloc/free, + * here we want to test various call orders. + */ + if (ns->rq_reset_mode == 2) { + netif_napi_del(&ns->rq[idx]->napi); + netif_napi_add_config(dev, &qmem->rq->napi, nsim_poll, idx); + } else if (ns->rq_reset_mode == 3) { + netif_napi_add_config(dev, &qmem->rq->napi, nsim_poll, idx); + netif_napi_del(&ns->rq[idx]->napi); + } + + ns->rq[idx] = qmem->rq; + napi_enable(&ns->rq[idx]->napi); + + return 0; +} + +static int nsim_queue_stop(struct net_device *dev, void *per_queue_mem, int idx) +{ + struct nsim_queue_mem *qmem = per_queue_mem; + struct netdevsim *ns = netdev_priv(dev); + + napi_disable(&ns->rq[idx]->napi); + + if (ns->rq_reset_mode == 1) { + qmem->pp = ns->rq[idx]->page_pool; + page_pool_disable_direct_recycling(qmem->pp); + } else { + qmem->rq = ns->rq[idx]; + } + + return 0; +} + +static const struct netdev_queue_mgmt_ops nsim_queue_mgmt_ops = { + .ndo_queue_mem_size = sizeof(struct nsim_queue_mem), + .ndo_queue_mem_alloc = nsim_queue_mem_alloc, + .ndo_queue_mem_free = nsim_queue_mem_free, + .ndo_queue_start = nsim_queue_start, + .ndo_queue_stop = nsim_queue_stop, +}; + +static ssize_t +nsim_qreset_write(struct file *file, const char __user *data, + size_t count, loff_t *ppos) +{ + struct netdevsim *ns = file->private_data; + unsigned int queue, mode; + char buf[32]; + ssize_t ret; + + if (count >= sizeof(buf)) + return -EINVAL; + if (copy_from_user(buf, data, count)) + return -EFAULT; + buf[count] = '\0'; + + ret = sscanf(buf, "%u %u", &queue, &mode); + if (ret != 2) + return -EINVAL; + + rtnl_lock(); + if 
(!netif_running(ns->netdev)) { + ret = -ENETDOWN; + goto exit_unlock; + } + + if (queue >= ns->netdev->real_num_rx_queues) { + ret = -EINVAL; + goto exit_unlock; + } + + ns->rq_reset_mode = mode; + ret = netdev_rx_queue_restart(ns->netdev, queue); + ns->rq_reset_mode = 0; + if (ret) + goto exit_unlock; + + ret = count; +exit_unlock: + rtnl_unlock(); + return ret; +} + +static const struct file_operations nsim_qreset_fops = { + .open = simple_open, + .write = nsim_qreset_write, + .owner = THIS_MODULE, +}; + static ssize_t nsim_pp_hold_read(struct file *file, char __user *data, size_t count, loff_t *ppos) @@ -628,7 +806,7 @@ nsim_pp_hold_write(struct file *file, const char __user *data, if (!netif_running(ns->netdev) && val) { ret = -ENETDOWN; } else if (val) { - ns->page = page_pool_dev_alloc_pages(ns->rq[0].page_pool); + ns->page = page_pool_dev_alloc_pages(ns->rq[0]->page_pool); if (!ns->page) ret = -ENOMEM; } else { @@ -677,27 +855,35 @@ static int nsim_queue_init(struct netdevsim *ns) struct net_device *dev = ns->netdev; int i; - ns->rq = kvcalloc(dev->num_rx_queues, sizeof(*ns->rq), - GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); + ns->rq = kcalloc(dev->num_rx_queues, sizeof(*ns->rq), + GFP_KERNEL_ACCOUNT); if (!ns->rq) return -ENOMEM; - for (i = 0; i < dev->num_rx_queues; i++) - skb_queue_head_init(&ns->rq[i].skb_queue); + for (i = 0; i < dev->num_rx_queues; i++) { + ns->rq[i] = nsim_queue_alloc(); + if (!ns->rq[i]) + goto err_free_prev; + } return 0; + +err_free_prev: + while (i--) + kfree(ns->rq[i]); + kfree(ns->rq); + return -ENOMEM; } -static void nsim_queue_free(struct netdevsim *ns) +static void nsim_queue_uninit(struct netdevsim *ns) { struct net_device *dev = ns->netdev; int i; for (i = 0; i < dev->num_rx_queues; i++) - skb_queue_purge_reason(&ns->rq[i].skb_queue, - SKB_DROP_REASON_QUEUE_PURGE); + nsim_queue_free(ns->rq[i]); - kvfree(ns->rq); + kfree(ns->rq); ns->rq = NULL; } @@ -713,6 +899,7 @@ static int nsim_init_netdevsim(struct netdevsim *ns) ns->phc 
= phc; ns->netdev->netdev_ops = &nsim_netdev_ops; ns->netdev->stat_ops = &nsim_stat_ops; + ns->netdev->queue_mgmt_ops = &nsim_queue_mgmt_ops; err = nsim_udp_tunnels_info_create(ns->nsim_dev, ns->netdev); if (err) @@ -741,7 +928,7 @@ err_ipsec_teardown: nsim_macsec_teardown(ns); nsim_bpf_uninit(ns); err_rq_destroy: - nsim_queue_free(ns); + nsim_queue_uninit(ns); err_utn_destroy: rtnl_unlock(); nsim_udp_tunnels_info_destroy(ns->netdev); @@ -798,6 +985,9 @@ nsim_create(struct nsim_dev *nsim_dev, struct nsim_dev_port *nsim_dev_port) ns->pp_dfs = debugfs_create_file("pp_hold", 0600, nsim_dev_port->ddir, ns, &nsim_pp_hold_fops); + ns->qr_dfs = debugfs_create_file("queue_reset", 0200, + nsim_dev_port->ddir, ns, + &nsim_qreset_fops); return ns; @@ -811,6 +1001,7 @@ void nsim_destroy(struct netdevsim *ns) struct net_device *dev = ns->netdev; struct netdevsim *peer; + debugfs_remove(ns->qr_dfs); debugfs_remove(ns->pp_dfs); rtnl_lock(); @@ -823,7 +1014,7 @@ void nsim_destroy(struct netdevsim *ns) nsim_macsec_teardown(ns); nsim_ipsec_teardown(ns); nsim_bpf_uninit(ns); - nsim_queue_free(ns); + nsim_queue_uninit(ns); } rtnl_unlock(); if (nsim_dev_port_is_pf(ns->nsim_dev_port)) diff --git a/drivers/net/netdevsim/netdevsim.h b/drivers/net/netdevsim/netdevsim.h index bf02efa10956..a70f62af4c88 100644 --- a/drivers/net/netdevsim/netdevsim.h +++ b/drivers/net/netdevsim/netdevsim.h @@ -101,7 +101,9 @@ struct netdevsim { struct nsim_dev *nsim_dev; struct nsim_dev_port *nsim_dev_port; struct mock_phc *phc; - struct nsim_rq *rq; + struct nsim_rq **rq; + + int rq_reset_mode; u64 tx_packets; u64 tx_bytes; @@ -134,6 +136,7 @@ struct netdevsim { struct page *page; struct dentry *pp_dfs; + struct dentry *qr_dfs; struct nsim_ethtool ethtool; struct netdevsim __rcu *peer; diff --git a/drivers/net/netkit.c b/drivers/net/netkit.c index c1d881dc6409..1e1b00756be7 100644 --- a/drivers/net/netkit.c +++ b/drivers/net/netkit.c @@ -338,6 +338,7 @@ static int netkit_new_link(struct net *peer_net, struct 
net_device *dev, enum netkit_scrub scrub_peer = NETKIT_SCRUB_DEFAULT; enum netkit_mode mode = NETKIT_L3; unsigned char ifname_assign_type; + u16 headroom = 0, tailroom = 0; struct ifinfomsg *ifmp = NULL; struct net_device *peer; char ifname[IFNAMSIZ]; @@ -371,6 +372,10 @@ static int netkit_new_link(struct net *peer_net, struct net_device *dev, if (err < 0) return err; } + if (data[IFLA_NETKIT_HEADROOM]) + headroom = nla_get_u16(data[IFLA_NETKIT_HEADROOM]); + if (data[IFLA_NETKIT_TAILROOM]) + tailroom = nla_get_u16(data[IFLA_NETKIT_TAILROOM]); } if (ifmp && tbp[IFLA_IFNAME]) { @@ -390,6 +395,14 @@ static int netkit_new_link(struct net *peer_net, struct net_device *dev, return PTR_ERR(peer); netif_inherit_tso_max(peer, dev); + if (headroom) { + peer->needed_headroom = headroom; + dev->needed_headroom = headroom; + } + if (tailroom) { + peer->needed_tailroom = tailroom; + dev->needed_tailroom = tailroom; + } if (mode == NETKIT_L2 && !(ifmp && tbp[IFLA_ADDRESS])) eth_hw_addr_random(peer); @@ -401,6 +414,7 @@ static int netkit_new_link(struct net *peer_net, struct net_device *dev, nk->policy = policy_peer; nk->scrub = scrub_peer; nk->mode = mode; + nk->headroom = headroom; bpf_mprog_bundle_init(&nk->bundle); err = register_netdevice(peer); @@ -426,6 +440,7 @@ static int netkit_new_link(struct net *peer_net, struct net_device *dev, nk->policy = policy_prim; nk->scrub = scrub_prim; nk->mode = mode; + nk->headroom = headroom; bpf_mprog_bundle_init(&nk->bundle); err = register_netdevice(dev); @@ -850,7 +865,18 @@ static int netkit_change_link(struct net_device *dev, struct nlattr *tb[], struct net_device *peer = rtnl_dereference(nk->peer); enum netkit_action policy; struct nlattr *attr; - int err; + int err, i; + static const struct { + u32 attr; + char *name; + } fixed_params[] = { + { IFLA_NETKIT_MODE, "operating mode" }, + { IFLA_NETKIT_SCRUB, "scrubbing" }, + { IFLA_NETKIT_PEER_SCRUB, "peer scrubbing" }, + { IFLA_NETKIT_PEER_INFO, "peer info" }, + { 
IFLA_NETKIT_HEADROOM, "headroom" }, + { IFLA_NETKIT_TAILROOM, "tailroom" }, + }; if (!nk->primary) { NL_SET_ERR_MSG(extack, @@ -858,28 +884,14 @@ static int netkit_change_link(struct net_device *dev, struct nlattr *tb[], return -EACCES; } - if (data[IFLA_NETKIT_MODE]) { - NL_SET_ERR_MSG_ATTR(extack, data[IFLA_NETKIT_MODE], - "netkit link operating mode cannot be changed after device creation"); - return -EACCES; - } - - if (data[IFLA_NETKIT_SCRUB]) { - NL_SET_ERR_MSG_ATTR(extack, data[IFLA_NETKIT_SCRUB], - "netkit scrubbing cannot be changed after device creation"); - return -EACCES; - } - - if (data[IFLA_NETKIT_PEER_SCRUB]) { - NL_SET_ERR_MSG_ATTR(extack, data[IFLA_NETKIT_PEER_SCRUB], - "netkit scrubbing cannot be changed after device creation"); - return -EACCES; - } - - if (data[IFLA_NETKIT_PEER_INFO]) { - NL_SET_ERR_MSG_ATTR(extack, data[IFLA_NETKIT_PEER_INFO], - "netkit peer info cannot be changed after device creation"); - return -EINVAL; + for (i = 0; i < ARRAY_SIZE(fixed_params); i++) { + attr = data[fixed_params[i].attr]; + if (attr) { + NL_SET_ERR_MSG_ATTR_FMT(extack, attr, + "netkit link %s cannot be changed after device creation", + fixed_params[i].name); + return -EACCES; + } } if (data[IFLA_NETKIT_POLICY]) { @@ -914,6 +926,8 @@ static size_t netkit_get_size(const struct net_device *dev) nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_PEER_SCRUB */ nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_MODE */ nla_total_size(sizeof(u8)) + /* IFLA_NETKIT_PRIMARY */ + nla_total_size(sizeof(u16)) + /* IFLA_NETKIT_HEADROOM */ + nla_total_size(sizeof(u16)) + /* IFLA_NETKIT_TAILROOM */ 0; } @@ -930,6 +944,10 @@ static int netkit_fill_info(struct sk_buff *skb, const struct net_device *dev) return -EMSGSIZE; if (nla_put_u32(skb, IFLA_NETKIT_SCRUB, nk->scrub)) return -EMSGSIZE; + if (nla_put_u16(skb, IFLA_NETKIT_HEADROOM, dev->needed_headroom)) + return -EMSGSIZE; + if (nla_put_u16(skb, IFLA_NETKIT_TAILROOM, dev->needed_tailroom)) + return -EMSGSIZE; if (peer) { nk = 
netkit_priv(peer); @@ -947,6 +965,8 @@ static const struct nla_policy netkit_policy[IFLA_NETKIT_MAX + 1] = { [IFLA_NETKIT_MODE] = NLA_POLICY_MAX(NLA_U32, NETKIT_L3), [IFLA_NETKIT_POLICY] = { .type = NLA_U32 }, [IFLA_NETKIT_PEER_POLICY] = { .type = NLA_U32 }, + [IFLA_NETKIT_HEADROOM] = { .type = NLA_U16 }, + [IFLA_NETKIT_TAILROOM] = { .type = NLA_U16 }, [IFLA_NETKIT_SCRUB] = NLA_POLICY_MAX(NLA_U32, NETKIT_SCRUB_DEFAULT), [IFLA_NETKIT_PEER_SCRUB] = NLA_POLICY_MAX(NLA_U32, NETKIT_SCRUB_DEFAULT), [IFLA_NETKIT_PRIMARY] = { .type = NLA_REJECT, diff --git a/drivers/net/pcs/pcs-lynx.c b/drivers/net/pcs/pcs-lynx.c index 767a8c0714ac..6457190ec6e7 100644 --- a/drivers/net/pcs/pcs-lynx.c +++ b/drivers/net/pcs/pcs-lynx.c @@ -334,9 +334,19 @@ static const struct phylink_pcs_ops lynx_pcs_phylink_ops = { .pcs_link_up = lynx_pcs_link_up, }; +static const phy_interface_t lynx_interfaces[] = { + PHY_INTERFACE_MODE_SGMII, + PHY_INTERFACE_MODE_QSGMII, + PHY_INTERFACE_MODE_1000BASEX, + PHY_INTERFACE_MODE_2500BASEX, + PHY_INTERFACE_MODE_10GBASER, + PHY_INTERFACE_MODE_USXGMII, +}; + static struct phylink_pcs *lynx_pcs_create(struct mdio_device *mdio) { struct lynx_pcs *lynx; + int i; lynx = kzalloc(sizeof(*lynx), GFP_KERNEL); if (!lynx) @@ -348,6 +358,9 @@ static struct phylink_pcs *lynx_pcs_create(struct mdio_device *mdio) lynx->pcs.neg_mode = true; lynx->pcs.poll = true; + for (i = 0; i < ARRAY_SIZE(lynx_interfaces); i++) + __set_bit(lynx_interfaces[i], lynx->pcs.supported_interfaces); + return lynx_to_phylink_pcs(lynx); } diff --git a/drivers/net/pcs/pcs-mtk-lynxi.c b/drivers/net/pcs/pcs-mtk-lynxi.c index ed91cd7a406a..4fe0fb6d12a4 100644 --- a/drivers/net/pcs/pcs-mtk-lynxi.c +++ b/drivers/net/pcs/pcs-mtk-lynxi.c @@ -307,6 +307,10 @@ struct phylink_pcs *mtk_pcs_lynxi_create(struct device *dev, mpcs->pcs.poll = true; mpcs->interface = PHY_INTERFACE_MODE_NA; + __set_bit(PHY_INTERFACE_MODE_SGMII, mpcs->pcs.supported_interfaces); + __set_bit(PHY_INTERFACE_MODE_1000BASEX, 
mpcs->pcs.supported_interfaces); + __set_bit(PHY_INTERFACE_MODE_2500BASEX, mpcs->pcs.supported_interfaces); + return &mpcs->pcs; } EXPORT_SYMBOL(mtk_pcs_lynxi_create); diff --git a/drivers/net/pcs/pcs-xpcs.c b/drivers/net/pcs/pcs-xpcs.c index f70ca39f0905..c06b66f40022 100644 --- a/drivers/net/pcs/pcs-xpcs.c +++ b/drivers/net/pcs/pcs-xpcs.c @@ -594,14 +594,13 @@ static unsigned int xpcs_inband_caps(struct phylink_pcs *pcs, } } -void xpcs_get_interfaces(struct dw_xpcs *xpcs, unsigned long *interfaces) +static void xpcs_get_interfaces(struct dw_xpcs *xpcs, unsigned long *interfaces) { const struct dw_xpcs_compat *compat; for (compat = xpcs->desc->compat; compat->supported; compat++) __set_bit(compat->interface, interfaces); } -EXPORT_SYMBOL_GPL(xpcs_get_interfaces); int xpcs_config_eee(struct dw_xpcs *xpcs, int mult_fact_100ns, int enable) { @@ -1446,6 +1445,8 @@ static struct dw_xpcs *xpcs_create(struct mdio_device *mdiodev) if (ret) goto out_clear_clks; + xpcs_get_interfaces(xpcs, xpcs->pcs.supported_interfaces); + if (xpcs->info.pma == WX_TXGBE_XPCS_PMA_10G_ID) xpcs->pcs.poll = false; else diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig index dc625f2b3ae4..9ad3dbfd2f99 100644 --- a/drivers/net/phy/Kconfig +++ b/drivers/net/phy/Kconfig @@ -287,8 +287,8 @@ config MICROCHIP_PHY config MICROCHIP_T1_PHY tristate "Microchip T1 PHYs" - select MICROCHIP_PHY_RDS_PTP if NETWORK_PHY_TIMESTAMPING && \ - PTP_1588_CLOCK_OPTIONAL + select MICROCHIP_PHY_RDS_PTP if NETWORK_PHY_TIMESTAMPING + depends on PTP_1588_CLOCK_OPTIONAL help Supports the LAN8XXX PHYs. 
diff --git a/drivers/net/phy/dp83822.c b/drivers/net/phy/dp83822.c index 334c17a68edd..4262bc31503b 100644 --- a/drivers/net/phy/dp83822.c +++ b/drivers/net/phy/dp83822.c @@ -30,6 +30,9 @@ #define MII_DP83822_FCSCR 0x14 #define MII_DP83822_RCSR 0x17 #define MII_DP83822_RESET_CTRL 0x1f +#define MII_DP83822_MLEDCR 0x25 +#define MII_DP83822_LEDCFG1 0x460 +#define MII_DP83822_IOCTRL1 0x462 #define MII_DP83822_IOCTRL2 0x463 #define MII_DP83822_GENCFG 0x465 #define MII_DP83822_SOR1 0x467 @@ -105,10 +108,26 @@ #define DP83822_RX_CLK_SHIFT BIT(12) #define DP83822_TX_CLK_SHIFT BIT(11) +/* MLEDCR bits */ +#define DP83822_MLEDCR_CFG GENMASK(6, 3) +#define DP83822_MLEDCR_ROUTE GENMASK(1, 0) +#define DP83822_MLEDCR_ROUTE_LED_0 DP83822_MLEDCR_ROUTE + +/* LEDCFG1 bits */ +#define DP83822_LEDCFG1_LED1_CTRL GENMASK(11, 8) +#define DP83822_LEDCFG1_LED3_CTRL GENMASK(7, 4) + +/* IOCTRL1 bits */ +#define DP83822_IOCTRL1_GPIO3_CTRL GENMASK(10, 8) +#define DP83822_IOCTRL1_GPIO3_CTRL_LED3 BIT(0) +#define DP83822_IOCTRL1_GPIO1_CTRL GENMASK(2, 0) +#define DP83822_IOCTRL1_GPIO1_CTRL_LED_1 BIT(0) + /* IOCTRL2 bits */ #define DP83822_IOCTRL2_GPIO2_CLK_SRC GENMASK(6, 4) #define DP83822_IOCTRL2_GPIO2_CTRL GENMASK(2, 0) #define DP83822_IOCTRL2_GPIO2_CTRL_CLK_REF GENMASK(1, 0) +#define DP83822_IOCTRL2_GPIO2_CTRL_MLED BIT(0) #define DP83822_CLK_SRC_MAC_IF 0x0 #define DP83822_CLK_SRC_XI 0x1 @@ -117,6 +136,22 @@ #define DP83822_CLK_SRC_FREE_RUNNING 0x6 #define DP83822_CLK_SRC_RECOVERED 0x7 +#define DP83822_LED_FN_LINK 0x0 /* Link established */ +#define DP83822_LED_FN_RX_TX 0x1 /* Receive or Transmit activity */ +#define DP83822_LED_FN_TX 0x2 /* Transmit activity */ +#define DP83822_LED_FN_RX 0x3 /* Receive activity */ +#define DP83822_LED_FN_COLLISION 0x4 /* Collision detected */ +#define DP83822_LED_FN_LINK_100_BTX 0x5 /* 100 BTX link established */ +#define DP83822_LED_FN_LINK_10_BT 0x6 /* 10BT link established */ +#define DP83822_LED_FN_FULL_DUPLEX 0x7 /* Full duplex */ +#define 
DP83822_LED_FN_LINK_RX_TX 0x8 /* Link established, blink for rx or tx activity */ +#define DP83822_LED_FN_ACTIVE_STRETCH 0x9 /* Active Stretch Signal */ +#define DP83822_LED_FN_MII_LINK 0xa /* MII LINK (100BT+FD) */ +#define DP83822_LED_FN_LPI_MODE 0xb /* LPI Mode (EEE) */ +#define DP83822_LED_FN_RX_TX_ERR 0xc /* TX/RX MII Error */ +#define DP83822_LED_FN_LINK_LOST 0xd /* Link Lost */ +#define DP83822_LED_FN_PRBS_ERR 0xe /* Blink for PRBS error */ + /* SOR1 mode */ #define DP83822_STRAP_MODE1 0 #define DP83822_STRAP_MODE2 BIT(0) @@ -145,6 +180,13 @@ ADVERTISED_FIBRE | \ ADVERTISED_Pause | ADVERTISED_Asym_Pause) +#define DP83822_MAX_LED_PINS 4 + +#define DP83822_LED_INDEX_LED_0 0 +#define DP83822_LED_INDEX_LED_1_GPIO1 1 +#define DP83822_LED_INDEX_COL_GPIO2 2 +#define DP83822_LED_INDEX_RX_D3_GPIO3 3 + struct dp83822_private { bool fx_signal_det_low; int fx_enabled; @@ -154,6 +196,7 @@ struct dp83822_private { struct ethtool_wolinfo wol; bool set_gpio2_clk_out; u32 gpio2_clk_out; + bool led_pin_enable[DP83822_MAX_LED_PINS]; }; static int dp83822_config_wol(struct phy_device *phydev, @@ -418,6 +461,48 @@ static int dp83822_read_status(struct phy_device *phydev) return 0; } +static int dp83822_config_init_leds(struct phy_device *phydev) +{ + struct dp83822_private *dp83822 = phydev->priv; + int ret; + + if (dp83822->led_pin_enable[DP83822_LED_INDEX_LED_0]) { + ret = phy_modify_mmd(phydev, MDIO_MMD_VEND2, MII_DP83822_MLEDCR, + DP83822_MLEDCR_ROUTE, + FIELD_PREP(DP83822_MLEDCR_ROUTE, + DP83822_MLEDCR_ROUTE_LED_0)); + if (ret) + return ret; + } else if (dp83822->led_pin_enable[DP83822_LED_INDEX_COL_GPIO2]) { + ret = phy_modify_mmd(phydev, MDIO_MMD_VEND2, MII_DP83822_IOCTRL2, + DP83822_IOCTRL2_GPIO2_CTRL, + FIELD_PREP(DP83822_IOCTRL2_GPIO2_CTRL, + DP83822_IOCTRL2_GPIO2_CTRL_MLED)); + if (ret) + return ret; + } + + if (dp83822->led_pin_enable[DP83822_LED_INDEX_LED_1_GPIO1]) { + ret = phy_modify_mmd(phydev, MDIO_MMD_VEND2, MII_DP83822_IOCTRL1, + DP83822_IOCTRL1_GPIO1_CTRL, + 
FIELD_PREP(DP83822_IOCTRL1_GPIO1_CTRL, + DP83822_IOCTRL1_GPIO1_CTRL_LED_1)); + if (ret) + return ret; + } + + if (dp83822->led_pin_enable[DP83822_LED_INDEX_RX_D3_GPIO3]) { + ret = phy_modify_mmd(phydev, MDIO_MMD_VEND2, MII_DP83822_IOCTRL1, + DP83822_IOCTRL1_GPIO3_CTRL, + FIELD_PREP(DP83822_IOCTRL1_GPIO3_CTRL, + DP83822_IOCTRL1_GPIO3_CTRL_LED3)); + if (ret) + return ret; + } + + return 0; +} + static int dp83822_config_init(struct phy_device *phydev) { struct dp83822_private *dp83822 = phydev->priv; @@ -437,6 +522,10 @@ static int dp83822_config_init(struct phy_device *phydev) FIELD_PREP(DP83822_IOCTRL2_GPIO2_CLK_SRC, dp83822->gpio2_clk_out)); + err = dp83822_config_init_leds(phydev); + if (err) + return err; + if (phy_interface_is_rgmii(phydev)) { rx_int_delay = phy_get_internal_delay(phydev, dev, NULL, 0, true); @@ -631,6 +720,61 @@ static int dp83822_phy_reset(struct phy_device *phydev) } #ifdef CONFIG_OF_MDIO +static int dp83822_of_init_leds(struct phy_device *phydev) +{ + struct device_node *node = phydev->mdio.dev.of_node; + struct dp83822_private *dp83822 = phydev->priv; + struct device_node *leds; + u32 index; + int err; + + if (!node) + return 0; + + leds = of_get_child_by_name(node, "leds"); + if (!leds) + return 0; + + for_each_available_child_of_node_scoped(leds, led) { + err = of_property_read_u32(led, "reg", &index); + if (err) { + of_node_put(leds); + return err; + } + + if (index <= DP83822_LED_INDEX_RX_D3_GPIO3) { + dp83822->led_pin_enable[index] = true; + } else { + of_node_put(leds); + return -EINVAL; + } + } + + of_node_put(leds); + /* LED_0 and COL(GPIO2) use the MLED function. MLED can be routed to + * only one of these two pins at a time. 
+ */ + if (dp83822->led_pin_enable[DP83822_LED_INDEX_LED_0] && + dp83822->led_pin_enable[DP83822_LED_INDEX_COL_GPIO2]) { + phydev_err(phydev, "LED_0 and COL(GPIO2) cannot be used as LED output at the same time\n"); + return -EINVAL; + } + + if (dp83822->led_pin_enable[DP83822_LED_INDEX_COL_GPIO2] && + dp83822->set_gpio2_clk_out) { + phydev_err(phydev, "COL(GPIO2) cannot be used as LED outout, already used as clock output\n"); + return -EINVAL; + } + + if (dp83822->led_pin_enable[DP83822_LED_INDEX_RX_D3_GPIO3] && + phydev->interface != PHY_INTERFACE_MODE_RMII) { + phydev_err(phydev, "RX_D3 can only be used as LED output when in RMII mode\n"); + return -EINVAL; + } + + return 0; +} + static int dp83822_of_init(struct phy_device *phydev) { struct dp83822_private *dp83822 = phydev->priv; @@ -671,7 +815,7 @@ static int dp83822_of_init(struct phy_device *phydev) dp83822->set_gpio2_clk_out = true; } - return 0; + return dp83822_of_init_leds(phydev); } static int dp83826_to_dac_minus_one_regval(int percent) @@ -769,7 +913,9 @@ static int dp83822_probe(struct phy_device *phydev) if (ret) return ret; - dp83822_of_init(phydev); + ret = dp83822_of_init(phydev); + if (ret) + return ret; if (dp83822->fx_enabled) phydev->port = PORT_FIBRE; @@ -816,6 +962,130 @@ static int dp83822_resume(struct phy_device *phydev) return 0; } +static int dp83822_led_mode(u8 index, unsigned long rules) +{ + switch (rules) { + case BIT(TRIGGER_NETDEV_LINK): + return DP83822_LED_FN_LINK; + case BIT(TRIGGER_NETDEV_LINK_10): + return DP83822_LED_FN_LINK_10_BT; + case BIT(TRIGGER_NETDEV_LINK_100): + return DP83822_LED_FN_LINK_100_BTX; + case BIT(TRIGGER_NETDEV_FULL_DUPLEX): + return DP83822_LED_FN_FULL_DUPLEX; + case BIT(TRIGGER_NETDEV_TX): + return DP83822_LED_FN_TX; + case BIT(TRIGGER_NETDEV_RX): + return DP83822_LED_FN_RX; + case BIT(TRIGGER_NETDEV_TX) | BIT(TRIGGER_NETDEV_RX): + return DP83822_LED_FN_RX_TX; + case BIT(TRIGGER_NETDEV_TX_ERR) | BIT(TRIGGER_NETDEV_RX_ERR): + return 
DP83822_LED_FN_RX_TX_ERR; + case BIT(TRIGGER_NETDEV_LINK) | BIT(TRIGGER_NETDEV_TX) | BIT(TRIGGER_NETDEV_RX): + return DP83822_LED_FN_LINK_RX_TX; + default: + return -EOPNOTSUPP; + } +} + +static int dp83822_led_hw_is_supported(struct phy_device *phydev, u8 index, + unsigned long rules) +{ + int mode; + + mode = dp83822_led_mode(index, rules); + if (mode < 0) + return mode; + + return 0; +} + +static int dp83822_led_hw_control_set(struct phy_device *phydev, u8 index, + unsigned long rules) +{ + int mode; + + mode = dp83822_led_mode(index, rules); + if (mode < 0) + return mode; + + if (index == DP83822_LED_INDEX_LED_0 || index == DP83822_LED_INDEX_COL_GPIO2) + return phy_modify_mmd(phydev, MDIO_MMD_VEND2, + MII_DP83822_MLEDCR, DP83822_MLEDCR_CFG, + FIELD_PREP(DP83822_MLEDCR_CFG, mode)); + else if (index == DP83822_LED_INDEX_LED_1_GPIO1) + return phy_modify_mmd(phydev, MDIO_MMD_VEND2, + MII_DP83822_LEDCFG1, + DP83822_LEDCFG1_LED1_CTRL, + FIELD_PREP(DP83822_LEDCFG1_LED1_CTRL, + mode)); + else + return phy_modify_mmd(phydev, MDIO_MMD_VEND2, + MII_DP83822_LEDCFG1, + DP83822_LEDCFG1_LED3_CTRL, + FIELD_PREP(DP83822_LEDCFG1_LED3_CTRL, + mode)); +} + +static int dp83822_led_hw_control_get(struct phy_device *phydev, u8 index, + unsigned long *rules) +{ + int val; + + if (index == DP83822_LED_INDEX_LED_0 || index == DP83822_LED_INDEX_COL_GPIO2) { + val = phy_read_mmd(phydev, MDIO_MMD_VEND2, MII_DP83822_MLEDCR); + if (val < 0) + return val; + + val = FIELD_GET(DP83822_MLEDCR_CFG, val); + } else { + val = phy_read_mmd(phydev, MDIO_MMD_VEND2, MII_DP83822_LEDCFG1); + if (val < 0) + return val; + + if (index == DP83822_LED_INDEX_LED_1_GPIO1) + val = FIELD_GET(DP83822_LEDCFG1_LED1_CTRL, val); + else + val = FIELD_GET(DP83822_LEDCFG1_LED3_CTRL, val); + } + + switch (val) { + case DP83822_LED_FN_LINK: + *rules = BIT(TRIGGER_NETDEV_LINK); + break; + case DP83822_LED_FN_LINK_10_BT: + *rules = BIT(TRIGGER_NETDEV_LINK_10); + break; + case DP83822_LED_FN_LINK_100_BTX: + *rules = 
BIT(TRIGGER_NETDEV_LINK_100); + break; + case DP83822_LED_FN_FULL_DUPLEX: + *rules = BIT(TRIGGER_NETDEV_FULL_DUPLEX); + break; + case DP83822_LED_FN_TX: + *rules = BIT(TRIGGER_NETDEV_TX); + break; + case DP83822_LED_FN_RX: + *rules = BIT(TRIGGER_NETDEV_RX); + break; + case DP83822_LED_FN_RX_TX: + *rules = BIT(TRIGGER_NETDEV_TX) | BIT(TRIGGER_NETDEV_RX); + break; + case DP83822_LED_FN_RX_TX_ERR: + *rules = BIT(TRIGGER_NETDEV_TX_ERR) | BIT(TRIGGER_NETDEV_RX_ERR); + break; + case DP83822_LED_FN_LINK_RX_TX: + *rules = BIT(TRIGGER_NETDEV_LINK) | BIT(TRIGGER_NETDEV_TX) | + BIT(TRIGGER_NETDEV_RX); + break; + default: + *rules = 0; + break; + } + + return 0; +} + #define DP83822_PHY_DRIVER(_id, _name) \ { \ PHY_ID_MATCH_MODEL(_id), \ @@ -831,6 +1101,9 @@ static int dp83822_resume(struct phy_device *phydev) .handle_interrupt = dp83822_handle_interrupt, \ .suspend = dp83822_suspend, \ .resume = dp83822_resume, \ + .led_hw_is_supported = dp83822_led_hw_is_supported, \ + .led_hw_control_set = dp83822_led_hw_control_set, \ + .led_hw_control_get = dp83822_led_hw_control_get, \ } #define DP83825_PHY_DRIVER(_id, _name) \ diff --git a/drivers/net/phy/dp83td510.c b/drivers/net/phy/dp83td510.c index 92aa3a2b9744..1abeacca7441 100644 --- a/drivers/net/phy/dp83td510.c +++ b/drivers/net/phy/dp83td510.c @@ -34,6 +34,29 @@ #define DP83TD510E_CTRL_HW_RESET BIT(15) #define DP83TD510E_CTRL_SW_RESET BIT(14) +/* + * DP83TD510E_PKT_STAT_x registers correspond to similarly named registers + * in the datasheet (PKT_STAT_1 through PKT_STAT_6). These registers store + * 32-bit or 16-bit counters for TX and RX statistics and must be read in + * sequence to ensure the counters are cleared correctly. + * + * - DP83TD510E_PKT_STAT_1: Contains TX packet count bits [15:0]. + * - DP83TD510E_PKT_STAT_2: Contains TX packet count bits [31:16]. + * - DP83TD510E_PKT_STAT_3: Contains TX error packet count. + * - DP83TD510E_PKT_STAT_4: Contains RX packet count bits [15:0]. 
+ * - DP83TD510E_PKT_STAT_5: Contains RX packet count bits [31:16]. + * - DP83TD510E_PKT_STAT_6: Contains RX error packet count. + * + * Keeping the register names as defined in the datasheet helps maintain + * clarity and alignment with the documentation. + */ +#define DP83TD510E_PKT_STAT_1 0x12b +#define DP83TD510E_PKT_STAT_2 0x12c +#define DP83TD510E_PKT_STAT_3 0x12d +#define DP83TD510E_PKT_STAT_4 0x12e +#define DP83TD510E_PKT_STAT_5 0x12f +#define DP83TD510E_PKT_STAT_6 0x130 + #define DP83TD510E_AN_STAT_1 0x60c #define DP83TD510E_MASTER_SLAVE_RESOL_FAIL BIT(15) @@ -58,8 +81,16 @@ static const u16 dp83td510_mse_sqi_map[] = { 0x0000 /* 24dB =< SNR */ }; +struct dp83td510_stats { + u64 tx_pkt_cnt; + u64 tx_err_pkt_cnt; + u64 rx_pkt_cnt; + u64 rx_err_pkt_cnt; +}; + struct dp83td510_priv { bool alcd_test_active; + struct dp83td510_stats stats; }; /* Time Domain Reflectometry (TDR) Functionality of DP83TD510 PHY @@ -177,6 +208,85 @@ struct dp83td510_priv { #define DP83TD510E_ALCD_COMPLETE BIT(15) #define DP83TD510E_ALCD_CABLE_LENGTH GENMASK(10, 0) +/** + * dp83td510_update_stats - Update the PHY statistics for the DP83TD510 PHY. + * @phydev: Pointer to the phy_device structure. + * + * The function reads the PHY statistics registers and updates the statistics + * structure. + * + * Returns: 0 on success or a negative error code on failure. + */ +static int dp83td510_update_stats(struct phy_device *phydev) +{ + struct dp83td510_priv *priv = phydev->priv; + u32 count; + int ret; + + /* The DP83TD510E_PKT_STAT registers are divided into two groups: + * - Group 1 (TX stats): DP83TD510E_PKT_STAT_1 to DP83TD510E_PKT_STAT_3 + * - Group 2 (RX stats): DP83TD510E_PKT_STAT_4 to DP83TD510E_PKT_STAT_6 + * + * Registers in each group are cleared only after reading them in a + * plain sequence (e.g., 1, 2, 3 for Group 1 or 4, 5, 6 for Group 2). + * Any deviation from the sequence, such as reading 1, 2, 1, 2, 3, will + * prevent the group from being cleared. 
Additionally, the counters + * for a group are frozen as soon as the first register in that group + * is accessed. + */ + ret = phy_read_mmd(phydev, MDIO_MMD_VEND2, DP83TD510E_PKT_STAT_1); + if (ret < 0) + return ret; + /* tx_pkt_cnt_15_0 */ + count = ret; + + ret = phy_read_mmd(phydev, MDIO_MMD_VEND2, DP83TD510E_PKT_STAT_2); + if (ret < 0) + return ret; + /* tx_pkt_cnt_31_16 */ + count |= ret << 16; + priv->stats.tx_pkt_cnt += count; + + ret = phy_read_mmd(phydev, MDIO_MMD_VEND2, DP83TD510E_PKT_STAT_3); + if (ret < 0) + return ret; + /* tx_err_pkt_cnt */ + priv->stats.tx_err_pkt_cnt += ret; + + ret = phy_read_mmd(phydev, MDIO_MMD_VEND2, DP83TD510E_PKT_STAT_4); + if (ret < 0) + return ret; + /* rx_pkt_cnt_15_0 */ + count = ret; + + ret = phy_read_mmd(phydev, MDIO_MMD_VEND2, DP83TD510E_PKT_STAT_5); + if (ret < 0) + return ret; + /* rx_pkt_cnt_31_16 */ + count |= ret << 16; + priv->stats.rx_pkt_cnt += count; + + ret = phy_read_mmd(phydev, MDIO_MMD_VEND2, DP83TD510E_PKT_STAT_6); + if (ret < 0) + return ret; + /* rx_err_pkt_cnt */ + priv->stats.rx_err_pkt_cnt += ret; + + return 0; +} + +static void dp83td510_get_phy_stats(struct phy_device *phydev, + struct ethtool_eth_phy_stats *eth_stats, + struct ethtool_phy_stats *stats) +{ + struct dp83td510_priv *priv = phydev->priv; + + stats->tx_packets = priv->stats.tx_pkt_cnt; + stats->tx_errors = priv->stats.tx_err_pkt_cnt; + stats->rx_packets = priv->stats.rx_pkt_cnt; + stats->rx_errors = priv->stats.rx_err_pkt_cnt; +} + static int dp83td510_config_intr(struct phy_device *phydev) { int ret; @@ -599,6 +709,8 @@ static struct phy_driver dp83td510_driver[] = { .get_sqi_max = dp83td510_get_sqi_max, .cable_test_start = dp83td510_cable_test_start, .cable_test_get_status = dp83td510_cable_test_get_status, + .get_phy_stats = dp83td510_get_phy_stats, + .update_stats = dp83td510_update_stats, .suspend = genphy_suspend, .resume = genphy_resume, diff --git a/drivers/net/phy/dp83tg720.c b/drivers/net/phy/dp83tg720.c index 
0ef4d7dba065..4ea752131b8c 100644 --- a/drivers/net/phy/dp83tg720.c +++ b/drivers/net/phy/dp83tg720.c @@ -51,6 +51,9 @@ /* Register 0x0405: Unknown Register */ #define DP83TG720S_UNKNOWN_0405 0x405 +#define DP83TG720S_LINK_QUAL_3 0x547 +#define DP83TG720S_LINK_LOSS_CNT_MASK GENMASK(15, 10) + /* Register 0x0576: TDR Master Link Down Control */ #define DP83TG720S_TDR_MASTER_LINK_DOWN 0x576 @@ -60,6 +63,29 @@ /* In RGMII mode, Enable or disable the internal delay for TXD */ #define DP83TG720S_RGMII_TX_CLK_SEL BIT(0) +/* + * DP83TG720S_PKT_STAT_x registers correspond to similarly named registers + * in the datasheet (PKT_STAT_1 through PKT_STAT_6). These registers store + * 32-bit or 16-bit counters for TX and RX statistics and must be read in + * sequence to ensure the counters are cleared correctly. + * + * - DP83TG720S_PKT_STAT_1: Contains TX packet count bits [15:0]. + * - DP83TG720S_PKT_STAT_2: Contains TX packet count bits [31:16]. + * - DP83TG720S_PKT_STAT_3: Contains TX error packet count. + * - DP83TG720S_PKT_STAT_4: Contains RX packet count bits [15:0]. + * - DP83TG720S_PKT_STAT_5: Contains RX packet count bits [31:16]. + * - DP83TG720S_PKT_STAT_6: Contains RX error packet count. + * + * Keeping the register names as defined in the datasheet helps maintain + * clarity and alignment with the documentation. + */ +#define DP83TG720S_PKT_STAT_1 0x639 +#define DP83TG720S_PKT_STAT_2 0x63a +#define DP83TG720S_PKT_STAT_3 0x63b +#define DP83TG720S_PKT_STAT_4 0x63c +#define DP83TG720S_PKT_STAT_5 0x63d +#define DP83TG720S_PKT_STAT_6 0x63e + /* Register 0x083F: Unknown Register */ #define DP83TG720S_UNKNOWN_083F 0x83f @@ -69,6 +95,113 @@ #define DP83TG720_SQI_MAX 7 +struct dp83tg720_stats { + u64 link_loss_cnt; + u64 tx_pkt_cnt; + u64 tx_err_pkt_cnt; + u64 rx_pkt_cnt; + u64 rx_err_pkt_cnt; +}; + +struct dp83tg720_priv { + struct dp83tg720_stats stats; +}; + +/** + * dp83tg720_update_stats - Update the PHY statistics for the DP83TD510 PHY. 
+ * @phydev: Pointer to the phy_device structure. + * + * The function reads the PHY statistics registers and updates the statistics + * structure. + * + * Returns: 0 on success or a negative error code on failure. + */ +static int dp83tg720_update_stats(struct phy_device *phydev) +{ + struct dp83tg720_priv *priv = phydev->priv; + u32 count; + int ret; + + /* Read the link loss count */ + ret = phy_read_mmd(phydev, MDIO_MMD_VEND2, DP83TG720S_LINK_QUAL_3); + if (ret < 0) + return ret; + /* link_loss_cnt */ + count = FIELD_GET(DP83TG720S_LINK_LOSS_CNT_MASK, ret); + priv->stats.link_loss_cnt += count; + + /* The DP83TG720S_PKT_STAT registers are divided into two groups: + * - Group 1 (TX stats): DP83TG720S_PKT_STAT_1 to DP83TG720S_PKT_STAT_3 + * - Group 2 (RX stats): DP83TG720S_PKT_STAT_4 to DP83TG720S_PKT_STAT_6 + * + * Registers in each group are cleared only after reading them in a + * plain sequence (e.g., 1, 2, 3 for Group 1 or 4, 5, 6 for Group 2). + * Any deviation from the sequence, such as reading 1, 2, 1, 2, 3, will + * prevent the group from being cleared. Additionally, the counters + * for a group are frozen as soon as the first register in that group + * is accessed. 
+ */ + ret = phy_read_mmd(phydev, MDIO_MMD_VEND2, DP83TG720S_PKT_STAT_1); + if (ret < 0) + return ret; + /* tx_pkt_cnt_15_0 */ + count = ret; + + ret = phy_read_mmd(phydev, MDIO_MMD_VEND2, DP83TG720S_PKT_STAT_2); + if (ret < 0) + return ret; + /* tx_pkt_cnt_31_16 */ + count |= ret << 16; + priv->stats.tx_pkt_cnt += count; + + ret = phy_read_mmd(phydev, MDIO_MMD_VEND2, DP83TG720S_PKT_STAT_3); + if (ret < 0) + return ret; + /* tx_err_pkt_cnt */ + priv->stats.tx_err_pkt_cnt += ret; + + ret = phy_read_mmd(phydev, MDIO_MMD_VEND2, DP83TG720S_PKT_STAT_4); + if (ret < 0) + return ret; + /* rx_pkt_cnt_15_0 */ + count = ret; + + ret = phy_read_mmd(phydev, MDIO_MMD_VEND2, DP83TG720S_PKT_STAT_5); + if (ret < 0) + return ret; + /* rx_pkt_cnt_31_16 */ + count |= ret << 16; + priv->stats.rx_pkt_cnt += count; + + ret = phy_read_mmd(phydev, MDIO_MMD_VEND2, DP83TG720S_PKT_STAT_6); + if (ret < 0) + return ret; + /* rx_err_pkt_cnt */ + priv->stats.rx_err_pkt_cnt += ret; + + return 0; +} + +static void dp83tg720_get_link_stats(struct phy_device *phydev, + struct ethtool_link_ext_stats *link_stats) +{ + struct dp83tg720_priv *priv = phydev->priv; + + link_stats->link_down_events = priv->stats.link_loss_cnt; +} + +static void dp83tg720_get_phy_stats(struct phy_device *phydev, + struct ethtool_eth_phy_stats *eth_stats, + struct ethtool_phy_stats *stats) +{ + struct dp83tg720_priv *priv = phydev->priv; + + stats->tx_packets = priv->stats.tx_pkt_cnt; + stats->tx_errors = priv->stats.tx_err_pkt_cnt; + stats->rx_packets = priv->stats.rx_pkt_cnt; + stats->rx_errors = priv->stats.rx_err_pkt_cnt; +} + /** * dp83tg720_cable_test_start - Start the cable test for the DP83TG720 PHY. * @phydev: Pointer to the phy_device structure. 
@@ -182,6 +315,11 @@ static int dp83tg720_cable_test_get_status(struct phy_device *phydev, ethnl_cable_test_result(phydev, ETHTOOL_A_CABLE_PAIR_A, stat); + /* save the current stats before resetting the PHY */ + ret = dp83tg720_update_stats(phydev); + if (ret) + return ret; + return phy_init_hw(phydev); } @@ -217,6 +355,11 @@ static int dp83tg720_read_status(struct phy_device *phydev) phy_sts = phy_read(phydev, DP83TG720S_MII_REG_10); phydev->link = !!(phy_sts & DP83TG720S_LINK_STATUS); if (!phydev->link) { + /* save the current stats before resetting the PHY */ + ret = dp83tg720_update_stats(phydev); + if (ret) + return ret; + /* According to the "DP83TC81x, DP83TG72x Software * Implementation Guide", the PHY needs to be reset after a * link loss or if no link is created after at least 100ms. @@ -341,12 +484,27 @@ static int dp83tg720_config_init(struct phy_device *phydev) return genphy_c45_pma_baset1_read_master_slave(phydev); } +static int dp83tg720_probe(struct phy_device *phydev) +{ + struct device *dev = &phydev->mdio.dev; + struct dp83tg720_priv *priv; + + priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + phydev->priv = priv; + + return 0; +} + static struct phy_driver dp83tg720_driver[] = { { PHY_ID_MATCH_MODEL(DP83TG720S_PHY_ID), .name = "TI DP83TG720S", .flags = PHY_POLL_CABLE_TEST, + .probe = dp83tg720_probe, .config_aneg = dp83tg720_config_aneg, .read_status = dp83tg720_read_status, .get_features = genphy_c45_pma_read_ext_abilities, @@ -355,6 +513,9 @@ static struct phy_driver dp83tg720_driver[] = { .get_sqi_max = dp83tg720_get_sqi_max, .cable_test_start = dp83tg720_cable_test_start, .cable_test_get_status = dp83tg720_cable_test_get_status, + .get_link_stats = dp83tg720_get_link_stats, + .get_phy_stats = dp83tg720_get_phy_stats, + .update_stats = dp83tg720_update_stats, .suspend = genphy_suspend, .resume = genphy_resume, diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c index 
eeb33eb181ac..0fd1cb676cd5 100644 --- a/drivers/net/phy/micrel.c +++ b/drivers/net/phy/micrel.c @@ -2006,7 +2006,7 @@ static int ksz9477_config_init(struct phy_device *phydev) * in this switch shall be regarded as broken. */ if (phydev->dev_flags & MICREL_NO_EEE) - linkmode_fill(phydev->eee_broken_modes); + phy_disable_eee(phydev); return kszphy_config_init(phydev); } diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index e4b04cdaa995..c008fe050245 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -616,6 +616,49 @@ int phy_ethtool_get_stats(struct phy_device *phydev, EXPORT_SYMBOL(phy_ethtool_get_stats); /** + * __phy_ethtool_get_phy_stats - Retrieve standardized PHY statistics + * @phydev: Pointer to the PHY device + * @phy_stats: Pointer to ethtool_eth_phy_stats structure + * @phydev_stats: Pointer to ethtool_phy_stats structure + * + * Fetches PHY statistics using a kernel-defined interface for consistent + * diagnostics. Unlike phy_ethtool_get_stats(), which allows custom stats, + * this function enforces a standardized format for better interoperability. + */ +void __phy_ethtool_get_phy_stats(struct phy_device *phydev, + struct ethtool_eth_phy_stats *phy_stats, + struct ethtool_phy_stats *phydev_stats) +{ + if (!phydev->drv || !phydev->drv->get_phy_stats) + return; + + mutex_lock(&phydev->lock); + phydev->drv->get_phy_stats(phydev, phy_stats, phydev_stats); + mutex_unlock(&phydev->lock); +} + +/** + * __phy_ethtool_get_link_ext_stats - Retrieve extended link statistics for a PHY + * @phydev: Pointer to the PHY device + * @link_stats: Pointer to the structure to store extended link statistics + * + * Populates the ethtool_link_ext_stats structure with link down event counts + * and additional driver-specific link statistics, if available. 
+ */ +void __phy_ethtool_get_link_ext_stats(struct phy_device *phydev, + struct ethtool_link_ext_stats *link_stats) +{ + link_stats->link_down_events = READ_ONCE(phydev->link_down_events); + + if (!phydev->drv || !phydev->drv->get_link_stats) + return; + + mutex_lock(&phydev->lock); + phydev->drv->get_link_stats(phydev, link_stats); + mutex_unlock(&phydev->lock); +} + +/** * phy_ethtool_get_plca_cfg - Get PLCA RS configuration * @phydev: the phy_device struct * @plca_cfg: where to store the retrieved configuration @@ -1399,6 +1442,23 @@ static int phy_enable_interrupts(struct phy_device *phydev) } /** + * phy_update_stats - Update PHY device statistics if supported. + * @phydev: Pointer to the PHY device structure. + * + * If the PHY driver provides an update_stats callback, this function + * invokes it to update the PHY statistics. If not, it returns 0. + * + * Return: 0 on success, or a negative error code if the callback fails. + */ +static int phy_update_stats(struct phy_device *phydev) +{ + if (!phydev->drv->update_stats) + return 0; + + return phydev->drv->update_stats(phydev); +} + +/** * phy_request_interrupt - request and enable interrupt for a PHY device * @phydev: target phy_device struct * @@ -1467,6 +1527,9 @@ static enum phy_state_work _phy_state_machine(struct phy_device *phydev) case PHY_RUNNING: err = phy_check_link_status(phydev); func = &phy_check_link_status; + + if (!err) + err = phy_update_stats(phydev); break; case PHY_CABLETEST: err = phydev->drv->cable_test_get_status(phydev, &finished); @@ -1641,6 +1704,27 @@ void phy_mac_interrupt(struct phy_device *phydev) EXPORT_SYMBOL(phy_mac_interrupt); /** + * phy_eee_rx_clock_stop() - configure PHY receive clock in LPI + * @phydev: target phy_device struct + * @clk_stop_enable: flag to indicate whether the clock can be stopped + * + * Configure whether the PHY can disable its receive clock during LPI mode, + * See IEEE 802.3 sections 22.2.2.2, 35.2.2.10, and 45.2.3.1.4. 
+ * + * Returns: 0 or negative error. + */ +int phy_eee_rx_clock_stop(struct phy_device *phydev, bool clk_stop_enable) +{ + /* Configure the PHY to stop receiving xMII + * clock while it is signaling LPI. + */ + return phy_modify_mmd(phydev, MDIO_MMD_PCS, MDIO_CTRL1, + MDIO_PCS_CTRL1_CLKSTOP_EN, + clk_stop_enable ? MDIO_PCS_CTRL1_CLKSTOP_EN : 0); +} +EXPORT_SYMBOL_GPL(phy_eee_rx_clock_stop); + +/** * phy_init_eee - init and check the EEE feature * @phydev: target phy_device struct * @clk_stop_enable: PHY may stop the clock during LPI @@ -1664,11 +1748,7 @@ int phy_init_eee(struct phy_device *phydev, bool clk_stop_enable) return -EPROTONOSUPPORT; if (clk_stop_enable) - /* Configure the PHY to stop receiving xMII - * clock while it is signaling LPI. - */ - ret = phy_set_bits_mmd(phydev, MDIO_MMD_PCS, MDIO_CTRL1, - MDIO_PCS_CTRL1_CLKSTOP_EN); + ret = phy_eee_rx_clock_stop(phydev, true); return ret < 0 ? ret : 0; } diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index bdc997f59779..5b34d39d1d52 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -3800,6 +3800,8 @@ static const struct ethtool_phy_ops phy_ethtool_phy_ops = { static const struct phylib_stubs __phylib_stubs = { .hwtstamp_get = __phy_hwtstamp_get, .hwtstamp_set = __phy_hwtstamp_set, + .get_phy_stats = __phy_ethtool_get_phy_stats, + .get_link_ext_stats = __phy_ethtool_get_link_ext_stats, }; static void phylib_register_stubs(void) diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index 6d50c2fdb190..31754d5fd659 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -691,6 +691,17 @@ static int phylink_validate_mac_and_pcs(struct phylink *pl, return -EINVAL; } + /* Ensure that this PCS supports the interface which the MAC + * returned it for. It is an error for the MAC to return a PCS + * that does not support the interface mode. 
+ */ + if (!phy_interface_empty(pcs->supported_interfaces) && + !test_bit(state->interface, pcs->supported_interfaces)) { + phylink_err(pl, "MAC returned PCS which does not support %s\n", + phy_modes(state->interface)); + return -EINVAL; + } + /* Validate the link parameters with the PCS */ if (pcs->ops->pcs_validate) { ret = pcs->ops->pcs_validate(pcs, supported, state); diff --git a/drivers/nfc/st21nfca/dep.c b/drivers/nfc/st21nfca/dep.c index 1ec651e31064..3425b68f0ddc 100644 --- a/drivers/nfc/st21nfca/dep.c +++ b/drivers/nfc/st21nfca/dep.c @@ -116,18 +116,16 @@ static void st21nfca_tx_work(struct work_struct *work) struct nfc_dev *dev; struct sk_buff *skb; - if (info) { - dev = info->hdev->ndev; - skb = info->dep_info.tx_pending; + dev = info->hdev->ndev; + skb = info->dep_info.tx_pending; - device_lock(&dev->dev); + device_lock(&dev->dev); - nfc_hci_send_cmd_async(info->hdev, ST21NFCA_RF_READER_F_GATE, - ST21NFCA_WR_XCHG_DATA, skb->data, skb->len, - info->async_cb, info); - device_unlock(&dev->dev); - kfree_skb(skb); - } + nfc_hci_send_cmd_async(info->hdev, ST21NFCA_RF_READER_F_GATE, + ST21NFCA_WR_XCHG_DATA, skb->data, skb->len, + info->async_cb, info); + device_unlock(&dev->dev); + kfree_skb(skb); } static void st21nfca_im_send_pdu(struct st21nfca_hci_info *info, diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 611b02c8a8b3..c4bb8dfe1a45 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -173,6 +173,11 @@ enum nvme_quirks { * MSI (but not MSI-X) interrupts are broken and never fire. 
*/ NVME_QUIRK_BROKEN_MSI = (1 << 21), + + /* + * Align dma pool segment size to 512 bytes + */ + NVME_QUIRK_DMAPOOL_ALIGN_512 = (1 << 22), }; /* diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 1a5ba80f1811..e2634f437f33 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2834,15 +2834,20 @@ static int nvme_disable_prepare_reset(struct nvme_dev *dev, bool shutdown) static int nvme_setup_prp_pools(struct nvme_dev *dev) { + size_t small_align = 256; + dev->prp_page_pool = dma_pool_create("prp list page", dev->dev, NVME_CTRL_PAGE_SIZE, NVME_CTRL_PAGE_SIZE, 0); if (!dev->prp_page_pool) return -ENOMEM; + if (dev->ctrl.quirks & NVME_QUIRK_DMAPOOL_ALIGN_512) + small_align = 512; + /* Optimisation for I/Os between 4k and 128k */ dev->prp_small_pool = dma_pool_create("prp list 256", dev->dev, - 256, 256, 0); + 256, small_align, 0); if (!dev->prp_small_pool) { dma_pool_destroy(dev->prp_page_pool); return -ENOMEM; @@ -3607,7 +3612,7 @@ static const struct pci_device_id nvme_id_table[] = { { PCI_VDEVICE(REDHAT, 0x0010), /* Qemu emulated controller */ .driver_data = NVME_QUIRK_BOGUS_NID, }, { PCI_DEVICE(0x1217, 0x8760), /* O2 Micro 64GB Steam Deck */ - .driver_data = NVME_QUIRK_QDEPTH_ONE }, + .driver_data = NVME_QUIRK_DMAPOOL_ALIGN_512, }, { PCI_DEVICE(0x126f, 0x2262), /* Silicon Motion generic */ .driver_data = NVME_QUIRK_NO_DEEPEST_PS | NVME_QUIRK_BOGUS_NID, }, diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 28c76a3e1bd2..b127d41dbbfe 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -2024,14 +2024,6 @@ static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl) return __nvme_tcp_alloc_io_queues(ctrl); } -static void nvme_tcp_destroy_io_queues(struct nvme_ctrl *ctrl, bool remove) -{ - nvme_tcp_stop_io_queues(ctrl); - if (remove) - nvme_remove_io_tag_set(ctrl); - nvme_tcp_free_io_queues(ctrl); -} - static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new) { int ret, nr_queues; @@ 
-2176,9 +2168,11 @@ static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl, nvme_sync_io_queues(ctrl); nvme_tcp_stop_io_queues(ctrl); nvme_cancel_tagset(ctrl); - if (remove) + if (remove) { nvme_unquiesce_io_queues(ctrl); - nvme_tcp_destroy_io_queues(ctrl, remove); + nvme_remove_io_tag_set(ctrl); + } + nvme_tcp_free_io_queues(ctrl); } static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl, @@ -2267,7 +2261,9 @@ destroy_io: nvme_sync_io_queues(ctrl); nvme_tcp_stop_io_queues(ctrl); nvme_cancel_tagset(ctrl); - nvme_tcp_destroy_io_queues(ctrl, new); + if (new) + nvme_remove_io_tag_set(ctrl); + nvme_tcp_free_io_queues(ctrl); } destroy_admin: nvme_stop_keep_alive(ctrl); diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 2962794ce881..fa89b0549c36 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -139,7 +139,7 @@ static u16 nvmet_get_smart_log_all(struct nvmet_req *req, unsigned long idx; ctrl = req->sq->ctrl; - xa_for_each(&ctrl->subsys->namespaces, idx, ns) { + nvmet_for_each_enabled_ns(&ctrl->subsys->namespaces, idx, ns) { /* we don't have the right data for file backed ns */ if (!ns->bdev) continue; @@ -331,9 +331,10 @@ static u32 nvmet_format_ana_group(struct nvmet_req *req, u32 grpid, u32 count = 0; if (!(req->cmd->get_log_page.lsp & NVME_ANA_LOG_RGO)) { - xa_for_each(&ctrl->subsys->namespaces, idx, ns) + nvmet_for_each_enabled_ns(&ctrl->subsys->namespaces, idx, ns) { if (ns->anagrpid == grpid) desc->nsids[count++] = cpu_to_le32(ns->nsid); + } } desc->grpid = cpu_to_le32(grpid); @@ -772,7 +773,7 @@ static void nvmet_execute_identify_endgrp_list(struct nvmet_req *req) goto out; } - xa_for_each(&ctrl->subsys->namespaces, idx, ns) { + nvmet_for_each_enabled_ns(&ctrl->subsys->namespaces, idx, ns) { if (ns->nsid <= min_endgid) continue; @@ -815,7 +816,7 @@ static void nvmet_execute_identify_nslist(struct nvmet_req *req, bool match_css) goto out; } - 
xa_for_each(&ctrl->subsys->namespaces, idx, ns) { + nvmet_for_each_enabled_ns(&ctrl->subsys->namespaces, idx, ns) { if (ns->nsid <= min_nsid) continue; if (match_css && req->ns->csi != req->cmd->identify.csi) diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index eeee9e9b854c..2b030f0efc38 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -810,18 +810,6 @@ static struct configfs_attribute *nvmet_ns_attrs[] = { NULL, }; -bool nvmet_subsys_nsid_exists(struct nvmet_subsys *subsys, u32 nsid) -{ - struct config_item *ns_item; - char name[12]; - - snprintf(name, sizeof(name), "%u", nsid); - mutex_lock(&subsys->namespaces_group.cg_subsys->su_mutex); - ns_item = config_group_find_item(&subsys->namespaces_group, name); - mutex_unlock(&subsys->namespaces_group.cg_subsys->su_mutex); - return ns_item != NULL; -} - static void nvmet_ns_release(struct config_item *item) { struct nvmet_ns *ns = to_nvmet_ns(item); @@ -2254,12 +2242,17 @@ static ssize_t nvmet_root_discovery_nqn_store(struct config_item *item, const char *page, size_t count) { struct list_head *entry; + char *old_nqn, *new_nqn; size_t len; len = strcspn(page, "\n"); if (!len || len > NVMF_NQN_FIELD_LEN - 1) return -EINVAL; + new_nqn = kstrndup(page, len, GFP_KERNEL); + if (!new_nqn) + return -ENOMEM; + down_write(&nvmet_config_sem); list_for_each(entry, &nvmet_subsystems_group.cg_children) { struct config_item *item = @@ -2268,13 +2261,15 @@ static ssize_t nvmet_root_discovery_nqn_store(struct config_item *item, if (!strncmp(config_item_name(item), page, len)) { pr_err("duplicate NQN %s\n", config_item_name(item)); up_write(&nvmet_config_sem); + kfree(new_nqn); return -EINVAL; } } - memset(nvmet_disc_subsys->subsysnqn, 0, NVMF_NQN_FIELD_LEN); - memcpy(nvmet_disc_subsys->subsysnqn, page, len); + old_nqn = nvmet_disc_subsys->subsysnqn; + nvmet_disc_subsys->subsysnqn = new_nqn; up_write(&nvmet_config_sem); + kfree(old_nqn); return len; } diff --git 
a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 1f4e9989663b..fde6c555af61 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -127,7 +127,7 @@ static u32 nvmet_max_nsid(struct nvmet_subsys *subsys) unsigned long idx; u32 nsid = 0; - xa_for_each(&subsys->namespaces, idx, cur) + nvmet_for_each_enabled_ns(&subsys->namespaces, idx, cur) nsid = cur->nsid; return nsid; @@ -441,11 +441,14 @@ u16 nvmet_req_find_ns(struct nvmet_req *req) struct nvmet_subsys *subsys = nvmet_req_subsys(req); req->ns = xa_load(&subsys->namespaces, nsid); - if (unlikely(!req->ns)) { + if (unlikely(!req->ns || !req->ns->enabled)) { req->error_loc = offsetof(struct nvme_common_command, nsid); - if (nvmet_subsys_nsid_exists(subsys, nsid)) - return NVME_SC_INTERNAL_PATH_ERROR; - return NVME_SC_INVALID_NS | NVME_STATUS_DNR; + if (!req->ns) /* ns doesn't exist! */ + return NVME_SC_INVALID_NS | NVME_STATUS_DNR; + + /* ns exists but it's disabled */ + req->ns = NULL; + return NVME_SC_INTERNAL_PATH_ERROR; } percpu_ref_get(&req->ns->ref); @@ -583,8 +586,6 @@ int nvmet_ns_enable(struct nvmet_ns *ns) goto out_unlock; ret = -EMFILE; - if (subsys->nr_namespaces == NVMET_MAX_NAMESPACES) - goto out_unlock; ret = nvmet_bdev_ns_enable(ns); if (ret == -ENOTBLK) @@ -599,38 +600,19 @@ int nvmet_ns_enable(struct nvmet_ns *ns) list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) nvmet_p2pmem_ns_add_p2p(ctrl, ns); - ret = percpu_ref_init(&ns->ref, nvmet_destroy_namespace, - 0, GFP_KERNEL); - if (ret) - goto out_dev_put; - - if (ns->nsid > subsys->max_nsid) - subsys->max_nsid = ns->nsid; - - ret = xa_insert(&subsys->namespaces, ns->nsid, ns, GFP_KERNEL); - if (ret) - goto out_restore_subsys_maxnsid; - if (ns->pr.enable) { ret = nvmet_pr_init_ns(ns); if (ret) - goto out_remove_from_subsys; + goto out_dev_put; } - subsys->nr_namespaces++; - nvmet_ns_changed(subsys, ns->nsid); ns->enabled = true; + xa_set_mark(&subsys->namespaces, ns->nsid, NVMET_NS_ENABLED); ret = 0; 
out_unlock: mutex_unlock(&subsys->lock); return ret; - -out_remove_from_subsys: - xa_erase(&subsys->namespaces, ns->nsid); -out_restore_subsys_maxnsid: - subsys->max_nsid = nvmet_max_nsid(subsys); - percpu_ref_exit(&ns->ref); out_dev_put: list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) pci_dev_put(radix_tree_delete(&ctrl->p2p_ns_map, ns->nsid)); @@ -649,15 +631,37 @@ void nvmet_ns_disable(struct nvmet_ns *ns) goto out_unlock; ns->enabled = false; - xa_erase(&ns->subsys->namespaces, ns->nsid); - if (ns->nsid == subsys->max_nsid) - subsys->max_nsid = nvmet_max_nsid(subsys); + xa_clear_mark(&subsys->namespaces, ns->nsid, NVMET_NS_ENABLED); list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) pci_dev_put(radix_tree_delete(&ctrl->p2p_ns_map, ns->nsid)); mutex_unlock(&subsys->lock); + if (ns->pr.enable) + nvmet_pr_exit_ns(ns); + + mutex_lock(&subsys->lock); + nvmet_ns_changed(subsys, ns->nsid); + nvmet_ns_dev_disable(ns); +out_unlock: + mutex_unlock(&subsys->lock); +} + +void nvmet_ns_free(struct nvmet_ns *ns) +{ + struct nvmet_subsys *subsys = ns->subsys; + + nvmet_ns_disable(ns); + + mutex_lock(&subsys->lock); + + xa_erase(&subsys->namespaces, ns->nsid); + if (ns->nsid == subsys->max_nsid) + subsys->max_nsid = nvmet_max_nsid(subsys); + + mutex_unlock(&subsys->lock); + /* * Now that we removed the namespaces from the lookup list, we * can kill the per_cpu ref and wait for any remaining references @@ -671,21 +675,9 @@ void nvmet_ns_disable(struct nvmet_ns *ns) wait_for_completion(&ns->disable_done); percpu_ref_exit(&ns->ref); - if (ns->pr.enable) - nvmet_pr_exit_ns(ns); - mutex_lock(&subsys->lock); - subsys->nr_namespaces--; - nvmet_ns_changed(subsys, ns->nsid); - nvmet_ns_dev_disable(ns); -out_unlock: mutex_unlock(&subsys->lock); -} - -void nvmet_ns_free(struct nvmet_ns *ns) -{ - nvmet_ns_disable(ns); down_write(&nvmet_ana_sem); nvmet_ana_group_enabled[ns->anagrpid]--; @@ -699,15 +691,33 @@ struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 
nsid) { struct nvmet_ns *ns; + mutex_lock(&subsys->lock); + + if (subsys->nr_namespaces == NVMET_MAX_NAMESPACES) + goto out_unlock; + ns = kzalloc(sizeof(*ns), GFP_KERNEL); if (!ns) - return NULL; + goto out_unlock; init_completion(&ns->disable_done); ns->nsid = nsid; ns->subsys = subsys; + if (percpu_ref_init(&ns->ref, nvmet_destroy_namespace, 0, GFP_KERNEL)) + goto out_free; + + if (ns->nsid > subsys->max_nsid) + subsys->max_nsid = nsid; + + if (xa_insert(&subsys->namespaces, ns->nsid, ns, GFP_KERNEL)) + goto out_exit; + + subsys->nr_namespaces++; + + mutex_unlock(&subsys->lock); + down_write(&nvmet_ana_sem); ns->anagrpid = NVMET_DEFAULT_ANA_GRPID; nvmet_ana_group_enabled[ns->anagrpid]++; @@ -718,6 +728,14 @@ struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid) ns->csi = NVME_CSI_NVM; return ns; +out_exit: + subsys->max_nsid = nvmet_max_nsid(subsys); + percpu_ref_exit(&ns->ref); +out_free: + kfree(ns); +out_unlock: + mutex_unlock(&subsys->lock); + return NULL; } static void nvmet_update_sq_head(struct nvmet_req *req) @@ -1394,7 +1412,7 @@ static void nvmet_setup_p2p_ns_map(struct nvmet_ctrl *ctrl, ctrl->p2p_client = get_device(req->p2p_client); - xa_for_each(&ctrl->subsys->namespaces, idx, ns) + nvmet_for_each_enabled_ns(&ctrl->subsys->namespaces, idx, ns) nvmet_p2pmem_ns_add_p2p(ctrl, ns); } diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c index 0bda83d0fc3e..eaf31c823cbe 100644 --- a/drivers/nvme/target/io-cmd-bdev.c +++ b/drivers/nvme/target/io-cmd-bdev.c @@ -36,7 +36,7 @@ void nvmet_bdev_set_limits(struct block_device *bdev, struct nvme_id_ns *id) */ id->nsfeat |= 1 << 4; /* NPWG = Namespace Preferred Write Granularity. 0's based */ - id->npwg = lpp0b; + id->npwg = to0based(bdev_io_min(bdev) / bdev_logical_block_size(bdev)); /* NPWA = Namespace Preferred Write Alignment. 0's based */ id->npwa = id->npwg; /* NPDG = Namespace Preferred Deallocate Granularity. 
0's based */ diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 58328b35dc96..7233549f7c8a 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -24,6 +24,7 @@ #define NVMET_DEFAULT_VS NVME_VS(2, 1, 0) +#define NVMET_NS_ENABLED XA_MARK_1 #define NVMET_ASYNC_EVENTS 4 #define NVMET_ERROR_LOG_SLOTS 128 #define NVMET_NO_ERROR_LOC ((u16)-1) @@ -33,6 +34,12 @@ #define NVMET_FR_MAX_SIZE 8 #define NVMET_PR_LOG_QUEUE_SIZE 64 +#define nvmet_for_each_ns(xa, index, entry) \ + xa_for_each(xa, index, entry) + +#define nvmet_for_each_enabled_ns(xa, index, entry) \ + xa_for_each_marked(xa, index, entry, NVMET_NS_ENABLED) + /* * Supported optional AENs: */ diff --git a/drivers/nvme/target/pr.c b/drivers/nvme/target/pr.c index 90e9f5bbe581..cd22d8333314 100644 --- a/drivers/nvme/target/pr.c +++ b/drivers/nvme/target/pr.c @@ -60,7 +60,7 @@ u16 nvmet_set_feat_resv_notif_mask(struct nvmet_req *req, u32 mask) goto success; } - xa_for_each(&ctrl->subsys->namespaces, idx, ns) { + nvmet_for_each_enabled_ns(&ctrl->subsys->namespaces, idx, ns) { if (ns->pr.enable) WRITE_ONCE(ns->pr.notify_mask, mask); } @@ -1056,7 +1056,7 @@ int nvmet_ctrl_init_pr(struct nvmet_ctrl *ctrl) * nvmet_pr_init_ns(), see more details in nvmet_ns_enable(). * So just check ns->pr.enable. 
*/ - xa_for_each(&subsys->namespaces, idx, ns) { + nvmet_for_each_enabled_ns(&subsys->namespaces, idx, ns) { if (ns->pr.enable) { ret = nvmet_pr_alloc_and_insert_pc_ref(ns, ctrl->cntlid, &ctrl->hostid); @@ -1067,7 +1067,7 @@ int nvmet_ctrl_init_pr(struct nvmet_ctrl *ctrl) return 0; free_per_ctrl_refs: - xa_for_each(&subsys->namespaces, idx, ns) { + nvmet_for_each_enabled_ns(&subsys->namespaces, idx, ns) { if (ns->pr.enable) { pc_ref = xa_erase(&ns->pr_per_ctrl_refs, ctrl->cntlid); if (pc_ref) @@ -1087,7 +1087,7 @@ void nvmet_ctrl_destroy_pr(struct nvmet_ctrl *ctrl) kfifo_free(&ctrl->pr_log_mgr.log_queue); mutex_destroy(&ctrl->pr_log_mgr.lock); - xa_for_each(&ctrl->subsys->namespaces, idx, ns) { + nvmet_for_each_enabled_ns(&ctrl->subsys->namespaces, idx, ns) { if (ns->pr.enable) { pc_ref = xa_erase(&ns->pr_per_ctrl_refs, ctrl->cntlid); if (pc_ref) diff --git a/drivers/ufs/core/ufshcd-priv.h b/drivers/ufs/core/ufshcd-priv.h index 9ffd94ddf8c7..786f20ef2238 100644 --- a/drivers/ufs/core/ufshcd-priv.h +++ b/drivers/ufs/core/ufshcd-priv.h @@ -237,12 +237,6 @@ static inline void ufshcd_vops_config_scaling_param(struct ufs_hba *hba, hba->vops->config_scaling_param(hba, p, data); } -static inline void ufshcd_vops_reinit_notify(struct ufs_hba *hba) -{ - if (hba->vops && hba->vops->reinit_notify) - hba->vops->reinit_notify(hba); -} - static inline int ufshcd_vops_mcq_config_resource(struct ufs_hba *hba) { if (hba->vops && hba->vops->mcq_config_resource) diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index 8a01e4393159..9c26e8767515 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -8858,7 +8858,6 @@ static int ufshcd_probe_hba(struct ufs_hba *hba, bool init_dev_params) ufshcd_device_reset(hba); ufs_put_device_desc(hba); ufshcd_hba_stop(hba); - ufshcd_vops_reinit_notify(hba); ret = ufshcd_hba_enable(hba); if (ret) { dev_err(hba->dev, "Host controller enable failed\n"); @@ -10591,14 +10590,17 @@ int ufshcd_init(struct ufs_hba *hba, 
void __iomem *mmio_base, unsigned int irq) } /* - * Set the default power management level for runtime and system PM. + * Set the default power management level for runtime and system PM if + * not set by the host controller drivers. * Default power saving mode is to keep UFS link in Hibern8 state * and UFS device in sleep state. */ - hba->rpm_lvl = ufs_get_desired_pm_lvl_for_dev_link_state( + if (!hba->rpm_lvl) + hba->rpm_lvl = ufs_get_desired_pm_lvl_for_dev_link_state( UFS_SLEEP_PWR_MODE, UIC_LINK_HIBERN8_STATE); - hba->spm_lvl = ufs_get_desired_pm_lvl_for_dev_link_state( + if (!hba->spm_lvl) + hba->spm_lvl = ufs_get_desired_pm_lvl_for_dev_link_state( UFS_SLEEP_PWR_MODE, UIC_LINK_HIBERN8_STATE); diff --git a/drivers/ufs/host/ufs-qcom.c b/drivers/ufs/host/ufs-qcom.c index 68040b2ab5f8..91e94fe990b4 100644 --- a/drivers/ufs/host/ufs-qcom.c +++ b/drivers/ufs/host/ufs-qcom.c @@ -368,6 +368,11 @@ static int ufs_qcom_power_up_sequence(struct ufs_hba *hba) if (ret) return ret; + if (phy->power_count) { + phy_power_off(phy); + phy_exit(phy); + } + /* phy initialization - calibrate the phy */ ret = phy_init(phy); if (ret) { @@ -866,6 +871,7 @@ static u32 ufs_qcom_get_ufs_hci_version(struct ufs_hba *hba) */ static void ufs_qcom_advertise_quirks(struct ufs_hba *hba) { + const struct ufs_qcom_drvdata *drvdata = of_device_get_match_data(hba->dev); struct ufs_qcom_host *host = ufshcd_get_variant(hba); if (host->hw_ver.major == 0x2) @@ -874,9 +880,8 @@ static void ufs_qcom_advertise_quirks(struct ufs_hba *hba) if (host->hw_ver.major > 0x3) hba->quirks |= UFSHCD_QUIRK_REINIT_AFTER_MAX_GEAR_SWITCH; - if (of_device_is_compatible(hba->dev->of_node, "qcom,sm8550-ufshc") || - of_device_is_compatible(hba->dev->of_node, "qcom,sm8650-ufshc")) - hba->quirks |= UFSHCD_QUIRK_BROKEN_LSDBS_CAP; + if (drvdata && drvdata->quirks) + hba->quirks |= drvdata->quirks; } static void ufs_qcom_set_phy_gear(struct ufs_qcom_host *host) @@ -1064,6 +1069,7 @@ static int ufs_qcom_init(struct ufs_hba *hba) 
struct device *dev = hba->dev; struct ufs_qcom_host *host; struct ufs_clk_info *clki; + const struct ufs_qcom_drvdata *drvdata = of_device_get_match_data(hba->dev); host = devm_kzalloc(dev, sizeof(*host), GFP_KERNEL); if (!host) @@ -1143,6 +1149,9 @@ static int ufs_qcom_init(struct ufs_hba *hba) dev_warn(dev, "%s: failed to configure the testbus %d\n", __func__, err); + if (drvdata && drvdata->no_phy_retention) + hba->spm_lvl = UFS_PM_LVL_5; + return 0; out_variant_clear: @@ -1579,13 +1588,6 @@ static void ufs_qcom_config_scaling_param(struct ufs_hba *hba, } #endif -static void ufs_qcom_reinit_notify(struct ufs_hba *hba) -{ - struct ufs_qcom_host *host = ufshcd_get_variant(hba); - - phy_power_off(host->generic_phy); -} - /* Resources */ static const struct ufshcd_res_info ufs_res_info[RES_MAX] = { {.name = "ufs_mem",}, @@ -1825,7 +1827,6 @@ static const struct ufs_hba_variant_ops ufs_hba_qcom_vops = { .device_reset = ufs_qcom_device_reset, .config_scaling_param = ufs_qcom_config_scaling_param, .program_key = ufs_qcom_ice_program_key, - .reinit_notify = ufs_qcom_reinit_notify, .mcq_config_resource = ufs_qcom_mcq_config_resource, .get_hba_mac = ufs_qcom_get_hba_mac, .op_runtime_config = ufs_qcom_op_runtime_config, @@ -1868,9 +1869,15 @@ static void ufs_qcom_remove(struct platform_device *pdev) platform_device_msi_free_irqs_all(hba->dev); } +static const struct ufs_qcom_drvdata ufs_qcom_sm8550_drvdata = { + .quirks = UFSHCD_QUIRK_BROKEN_LSDBS_CAP, + .no_phy_retention = true, +}; + static const struct of_device_id ufs_qcom_of_match[] __maybe_unused = { { .compatible = "qcom,ufshc" }, - { .compatible = "qcom,sm8550-ufshc" }, + { .compatible = "qcom,sm8550-ufshc", .data = &ufs_qcom_sm8550_drvdata }, + { .compatible = "qcom,sm8650-ufshc", .data = &ufs_qcom_sm8550_drvdata }, {}, }; MODULE_DEVICE_TABLE(of, ufs_qcom_of_match); diff --git a/drivers/ufs/host/ufs-qcom.h b/drivers/ufs/host/ufs-qcom.h index b9de170983c9..919f53682beb 100644 --- a/drivers/ufs/host/ufs-qcom.h +++ 
b/drivers/ufs/host/ufs-qcom.h @@ -217,6 +217,11 @@ struct ufs_qcom_host { bool esi_enabled; }; +struct ufs_qcom_drvdata { + enum ufshcd_quirks quirks; + bool no_phy_retention; +}; + static inline u32 ufs_qcom_get_debug_reg_offset(struct ufs_qcom_host *host, u32 reg) { diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 1ab58da9f38a..1a4ed5a357d3 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -1661,14 +1661,15 @@ static vm_fault_t vfio_pci_mmap_huge_fault(struct vm_fault *vmf, unsigned long pfn, pgoff = vmf->pgoff - vma->vm_pgoff; vm_fault_t ret = VM_FAULT_SIGBUS; - if (order && (vmf->address & ((PAGE_SIZE << order) - 1) || + pfn = vma_to_pfn(vma) + pgoff; + + if (order && (pfn & ((1 << order) - 1) || + vmf->address & ((PAGE_SIZE << order) - 1) || vmf->address + (PAGE_SIZE << order) > vma->vm_end)) { ret = VM_FAULT_FALLBACK; goto out; } - pfn = vma_to_pfn(vma); - down_read(&vdev->memory_lock); if (vdev->pm_runtime_engaged || !__vfio_pci_memory_enabled(vdev)) @@ -1676,18 +1677,18 @@ static vm_fault_t vfio_pci_mmap_huge_fault(struct vm_fault *vmf, switch (order) { case 0: - ret = vmf_insert_pfn(vma, vmf->address, pfn + pgoff); + ret = vmf_insert_pfn(vma, vmf->address, pfn); break; #ifdef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP case PMD_ORDER: - ret = vmf_insert_pfn_pmd(vmf, __pfn_to_pfn_t(pfn + pgoff, - PFN_DEV), false); + ret = vmf_insert_pfn_pmd(vmf, + __pfn_to_pfn_t(pfn, PFN_DEV), false); break; #endif #ifdef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP case PUD_ORDER: - ret = vmf_insert_pfn_pud(vmf, __pfn_to_pfn_t(pfn + pgoff, - PFN_DEV), false); + ret = vmf_insert_pfn_pud(vmf, + __pfn_to_pfn_t(pfn, PFN_DEV), false); break; #endif default: diff --git a/drivers/watchdog/stm32_iwdg.c b/drivers/watchdog/stm32_iwdg.c index d700e0d49bb9..8ad06b54c5ad 100644 --- a/drivers/watchdog/stm32_iwdg.c +++ b/drivers/watchdog/stm32_iwdg.c @@ -286,7 +286,7 @@ static int stm32_iwdg_irq_init(struct platform_device *pdev, if 
(!wdt->data->has_early_wakeup) return 0; - irq = platform_get_irq(pdev, 0); + irq = platform_get_irq_optional(pdev, 0); if (irq <= 0) return 0; diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index 819c75233235..3bc9ce6c575e 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -57,6 +57,8 @@ static void v9fs_issue_write(struct netfs_io_subrequest *subreq) int err, len; len = p9_client_write(fid, subreq->start, &subreq->io_iter, &err); + if (len > 0) + __set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); netfs_write_subrequest_terminated(subreq, len ?: err, false); } @@ -80,8 +82,10 @@ static void v9fs_issue_read(struct netfs_io_subrequest *subreq) if (pos + total >= i_size_read(rreq->inode)) __set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags); - if (!err) + if (!err) { subreq->transferred += total; + __set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); + } netfs_read_subreq_terminated(subreq, err, false); } diff --git a/fs/afs/write.c b/fs/afs/write.c index 34107b55f834..ccb6aa8027c5 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -122,7 +122,7 @@ static void afs_issue_write_worker(struct work_struct *work) if (subreq->debug_index == 3) return netfs_write_subrequest_terminated(subreq, -ENOANO, false); - if (!test_bit(NETFS_SREQ_RETRYING, &subreq->flags)) { + if (!subreq->retry_count) { set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); return netfs_write_subrequest_terminated(subreq, -EAGAIN, false); } @@ -149,6 +149,9 @@ static void afs_issue_write_worker(struct work_struct *work) afs_wait_for_operation(op); ret = afs_put_operation(op); switch (ret) { + case 0: + __set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); + break; case -EACCES: case -EPERM: case -ENOKEY: diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 3af8bb0c8d75..4d9305fa37a8 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4878,25 +4878,29 @@ out_fail: return ret; } +struct btrfs_uring_encoded_data { + struct btrfs_ioctl_encoded_io_args args; + struct iovec iovstack[UIO_FASTIOV]; + 
struct iovec *iov; + struct iov_iter iter; +}; + static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue_flags) { size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args, flags); size_t copy_end; - struct btrfs_ioctl_encoded_io_args args = { 0 }; int ret; u64 disk_bytenr, disk_io_size; struct file *file; struct btrfs_inode *inode; struct btrfs_fs_info *fs_info; struct extent_io_tree *io_tree; - struct iovec iovstack[UIO_FASTIOV]; - struct iovec *iov = iovstack; - struct iov_iter iter; loff_t pos; struct kiocb kiocb; struct extent_state *cached_state = NULL; u64 start, lockend; void __user *sqe_addr; + struct btrfs_uring_encoded_data *data = io_uring_cmd_get_async_data(cmd)->op_data; if (!capable(CAP_SYS_ADMIN)) { ret = -EPERM; @@ -4910,43 +4914,64 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue if (issue_flags & IO_URING_F_COMPAT) { #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) - struct btrfs_ioctl_encoded_io_args_32 args32; - copy_end = offsetofend(struct btrfs_ioctl_encoded_io_args_32, flags); - if (copy_from_user(&args32, sqe_addr, copy_end)) { - ret = -EFAULT; - goto out_acct; - } - args.iov = compat_ptr(args32.iov); - args.iovcnt = args32.iovcnt; - args.offset = args32.offset; - args.flags = args32.flags; #else return -ENOTTY; #endif } else { copy_end = copy_end_kernel; - if (copy_from_user(&args, sqe_addr, copy_end)) { - ret = -EFAULT; + } + + if (!data) { + data = kzalloc(sizeof(*data), GFP_NOFS); + if (!data) { + ret = -ENOMEM; goto out_acct; } - } - if (args.flags != 0) - return -EINVAL; + io_uring_cmd_get_async_data(cmd)->op_data = data; - ret = import_iovec(ITER_DEST, args.iov, args.iovcnt, ARRAY_SIZE(iovstack), - &iov, &iter); - if (ret < 0) - goto out_acct; + if (issue_flags & IO_URING_F_COMPAT) { +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) + struct btrfs_ioctl_encoded_io_args_32 args32; - if (iov_iter_count(&iter) == 0) { - ret = 0; - goto out_free; + if 
(copy_from_user(&args32, sqe_addr, copy_end)) { + ret = -EFAULT; + goto out_acct; + } + + data->args.iov = compat_ptr(args32.iov); + data->args.iovcnt = args32.iovcnt; + data->args.offset = args32.offset; + data->args.flags = args32.flags; +#endif + } else { + if (copy_from_user(&data->args, sqe_addr, copy_end)) { + ret = -EFAULT; + goto out_acct; + } + } + + if (data->args.flags != 0) { + ret = -EINVAL; + goto out_acct; + } + + data->iov = data->iovstack; + ret = import_iovec(ITER_DEST, data->args.iov, data->args.iovcnt, + ARRAY_SIZE(data->iovstack), &data->iov, + &data->iter); + if (ret < 0) + goto out_acct; + + if (iov_iter_count(&data->iter) == 0) { + ret = 0; + goto out_free; + } } - pos = args.offset; - ret = rw_verify_area(READ, file, &pos, args.len); + pos = data->args.offset; + ret = rw_verify_area(READ, file, &pos, data->args.len); if (ret < 0) goto out_free; @@ -4959,15 +4984,16 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue start = ALIGN_DOWN(pos, fs_info->sectorsize); lockend = start + BTRFS_MAX_UNCOMPRESSED - 1; - ret = btrfs_encoded_read(&kiocb, &iter, &args, &cached_state, + ret = btrfs_encoded_read(&kiocb, &data->iter, &data->args, &cached_state, &disk_bytenr, &disk_io_size); if (ret < 0 && ret != -EIOCBQUEUED) goto out_free; file_accessed(file); - if (copy_to_user(sqe_addr + copy_end, (const char *)&args + copy_end_kernel, - sizeof(args) - copy_end_kernel)) { + if (copy_to_user(sqe_addr + copy_end, + (const char *)&data->args + copy_end_kernel, + sizeof(data->args) - copy_end_kernel)) { if (ret == -EIOCBQUEUED) { unlock_extent(io_tree, start, lockend, &cached_state); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); @@ -4977,40 +5003,22 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue } if (ret == -EIOCBQUEUED) { - u64 count; - - /* - * If we've optimized things by storing the iovecs on the stack, - * undo this. 
- */ - if (!iov) { - iov = kmalloc(sizeof(struct iovec) * args.iovcnt, GFP_NOFS); - if (!iov) { - unlock_extent(io_tree, start, lockend, &cached_state); - btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); - ret = -ENOMEM; - goto out_acct; - } - - memcpy(iov, iovstack, sizeof(struct iovec) * args.iovcnt); - } - - count = min_t(u64, iov_iter_count(&iter), disk_io_size); + u64 count = min_t(u64, iov_iter_count(&data->iter), disk_io_size); /* Match ioctl by not returning past EOF if uncompressed. */ - if (!args.compression) - count = min_t(u64, count, args.len); + if (!data->args.compression) + count = min_t(u64, count, data->args.len); - ret = btrfs_uring_read_extent(&kiocb, &iter, start, lockend, - cached_state, disk_bytenr, - disk_io_size, count, - args.compression, iov, cmd); + ret = btrfs_uring_read_extent(&kiocb, &data->iter, start, lockend, + cached_state, disk_bytenr, disk_io_size, + count, data->args.compression, + data->iov, cmd); goto out_acct; } out_free: - kfree(iov); + kfree(data->iov); out_acct: if (ret > 0) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 204c928beaf9..531312efee8d 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1541,6 +1541,10 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg, u64 extent_gen; int ret; + if (unlikely(!extent_root)) { + btrfs_err(fs_info, "no valid extent root for scrub"); + return -EUCLEAN; + } memset(stripe->sectors, 0, sizeof(struct scrub_sector_verification) * stripe->nr_sectors); scrub_stripe_reset_bitmaps(stripe); diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index ddf0d5a448a7..c9e92c6941ec 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -174,10 +174,10 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, copy_page(workspace->buf + i * PAGE_SIZE, data_in); start += PAGE_SIZE; - workspace->strm.avail_in = - (in_buf_folios << PAGE_SHIFT); } workspace->strm.next_in = workspace->buf; + workspace->strm.avail_in = min(bytes_left, + in_buf_folios << 
PAGE_SHIFT); } else { unsigned int pg_off; unsigned int cur_len; diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 11ed523e528e..df905ae82929 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -748,8 +748,9 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) (u64)lim->max_segments << PAGE_SHIFT), fs_info->sectorsize); fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED; - if (fs_info->max_zone_append_size < fs_info->max_extent_size) - fs_info->max_extent_size = fs_info->max_zone_append_size; + + fs_info->max_extent_size = min_not_zero(fs_info->max_extent_size, + fs_info->max_zone_append_size); /* * Check mount options here, because we might change fs_info->zoned diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c index 89b11336a836..1806bff8e59b 100644 --- a/fs/cachefiles/daemon.c +++ b/fs/cachefiles/daemon.c @@ -15,6 +15,7 @@ #include <linux/namei.h> #include <linux/poll.h> #include <linux/mount.h> +#include <linux/security.h> #include <linux/statfs.h> #include <linux/ctype.h> #include <linux/string.h> @@ -576,7 +577,7 @@ static int cachefiles_daemon_dir(struct cachefiles_cache *cache, char *args) */ static int cachefiles_daemon_secctx(struct cachefiles_cache *cache, char *args) { - char *secctx; + int err; _enter(",%s", args); @@ -585,16 +586,16 @@ static int cachefiles_daemon_secctx(struct cachefiles_cache *cache, char *args) return -EINVAL; } - if (cache->secctx) { + if (cache->have_secid) { pr_err("Second security context specified\n"); return -EINVAL; } - secctx = kstrdup(args, GFP_KERNEL); - if (!secctx) - return -ENOMEM; + err = security_secctx_to_secid(args, strlen(args), &cache->secid); + if (err) + return err; - cache->secctx = secctx; + cache->have_secid = true; return 0; } @@ -820,7 +821,6 @@ static void cachefiles_daemon_unbind(struct cachefiles_cache *cache) put_cred(cache->cache_cred); kfree(cache->rootdirname); - kfree(cache->secctx); kfree(cache->tag); _leave(""); diff --git a/fs/cachefiles/internal.h 
b/fs/cachefiles/internal.h index 7b99bd98de75..38c236e38cef 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -122,7 +122,6 @@ struct cachefiles_cache { #define CACHEFILES_STATE_CHANGED 3 /* T if state changed (poll trigger) */ #define CACHEFILES_ONDEMAND_MODE 4 /* T if in on-demand read mode */ char *rootdirname; /* name of cache root directory */ - char *secctx; /* LSM security context */ char *tag; /* cache binding tag */ refcount_t unbind_pincount;/* refcount to do daemon unbind */ struct xarray reqs; /* xarray of pending on-demand requests */ @@ -130,6 +129,8 @@ struct cachefiles_cache { struct xarray ondemand_ids; /* xarray for ondemand_id allocation */ u32 ondemand_id_next; u32 msg_id_next; + u32 secid; /* LSM security id */ + bool have_secid; /* whether "secid" was set */ }; static inline bool cachefiles_in_ondemand_mode(struct cachefiles_cache *cache) diff --git a/fs/cachefiles/security.c b/fs/cachefiles/security.c index fe777164f1d8..fc6611886b3b 100644 --- a/fs/cachefiles/security.c +++ b/fs/cachefiles/security.c @@ -18,7 +18,7 @@ int cachefiles_get_security_ID(struct cachefiles_cache *cache) struct cred *new; int ret; - _enter("{%s}", cache->secctx); + _enter("{%u}", cache->have_secid ? 
cache->secid : 0); new = prepare_kernel_cred(current); if (!new) { @@ -26,8 +26,8 @@ int cachefiles_get_security_ID(struct cachefiles_cache *cache) goto error; } - if (cache->secctx) { - ret = set_security_override_from_ctx(new, cache->secctx); + if (cache->have_secid) { + ret = set_security_override(new, cache->secid); if (ret < 0) { put_cred(new); pr_err("Security denies permission to nominate security context: error %d\n", diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c index fe0a9b8a0cd0..3103b932b674 100644 --- a/fs/exfat/dir.c +++ b/fs/exfat/dir.c @@ -122,7 +122,7 @@ static int exfat_readdir(struct inode *inode, loff_t *cpos, struct exfat_dir_ent type = exfat_get_entry_type(ep); if (type == TYPE_UNUSED) { brelse(bh); - break; + goto out; } if (type != TYPE_FILE && type != TYPE_DIR) { @@ -170,6 +170,7 @@ static int exfat_readdir(struct inode *inode, loff_t *cpos, struct exfat_dir_ent } } +out: dir_entry->namebuf.lfn[0] = '\0'; *cpos = EXFAT_DEN_TO_B(dentry); return 0; diff --git a/fs/exfat/fatent.c b/fs/exfat/fatent.c index 773c320d68f3..9e5492ac409b 100644 --- a/fs/exfat/fatent.c +++ b/fs/exfat/fatent.c @@ -216,6 +216,16 @@ static int __exfat_free_cluster(struct inode *inode, struct exfat_chain *p_chain if (err) goto dec_used_clus; + + if (num_clusters >= sbi->num_clusters - EXFAT_FIRST_CLUSTER) { + /* + * The cluster chain includes a loop, scan the + * bitmap to get the number of used clusters. 
+ */ + exfat_count_used_clusters(sb, &sbi->used_clusters); + + return 0; + } } while (clu != EXFAT_EOF_CLUSTER); } diff --git a/fs/exfat/file.c b/fs/exfat/file.c index fb38769c3e39..05b51e721783 100644 --- a/fs/exfat/file.c +++ b/fs/exfat/file.c @@ -545,6 +545,7 @@ static int exfat_extend_valid_size(struct file *file, loff_t new_valid_size) while (pos < new_valid_size) { u32 len; struct folio *folio; + unsigned long off; len = PAGE_SIZE - (pos & (PAGE_SIZE - 1)); if (pos + len > new_valid_size) @@ -554,6 +555,9 @@ static int exfat_extend_valid_size(struct file *file, loff_t new_valid_size) if (err) goto out; + off = offset_in_folio(folio, pos); + folio_zero_new_buffers(folio, off, off + len); + err = ops->write_end(file, mapping, pos, len, len, folio, NULL); if (err < 0) goto out; @@ -563,6 +567,8 @@ static int exfat_extend_valid_size(struct file *file, loff_t new_valid_size) cond_resched(); } + return 0; + out: return err; } diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c index 97d2774760fe..099f80645072 100644 --- a/fs/exfat/namei.c +++ b/fs/exfat/namei.c @@ -330,8 +330,8 @@ static int exfat_find_empty_entry(struct inode *inode, while ((dentry = exfat_search_empty_slot(sb, &hint_femp, p_dir, num_entries, es)) < 0) { - if (dentry == -EIO) - break; + if (dentry != -ENOSPC) + return dentry; if (exfat_check_max_dentries(inode)) return -ENOSPC; diff --git a/fs/file.c b/fs/file.c index fb1011cf6b4a..25c6e53b03f8 100644 --- a/fs/file.c +++ b/fs/file.c @@ -22,6 +22,7 @@ #include <linux/close_range.h> #include <linux/file_ref.h> #include <net/sock.h> +#include <linux/init_task.h> #include "internal.h" diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 494ac372ace0..e540d05549ff 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1681,6 +1681,8 @@ static int fuse_dir_open(struct inode *inode, struct file *file) */ if (ff->open_flags & (FOPEN_STREAM | FOPEN_NONSEEKABLE)) nonseekable_open(inode, file); + if (!(ff->open_flags & FOPEN_KEEP_CACHE)) + 
invalidate_inode_pages2(inode->i_mapping); } return err; diff --git a/fs/hfs/super.c b/fs/hfs/super.c index 3bee9b5dba5e..fe09c2093a93 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -349,11 +349,13 @@ static int hfs_fill_super(struct super_block *sb, struct fs_context *fc) goto bail_no_root; res = hfs_cat_find_brec(sb, HFS_ROOT_CNID, &fd); if (!res) { - if (fd.entrylength > sizeof(rec) || fd.entrylength < 0) { + if (fd.entrylength != sizeof(rec.dir)) { res = -EIO; goto bail_hfs_find; } hfs_bnode_read(fd.bnode, &rec, fd.entryoffset, fd.entrylength); + if (rec.type != HFS_CDR_DIR) + res = -EIO; } if (res) goto bail_hfs_find; diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 955f19e27e47..54dc27d92781 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1774,7 +1774,8 @@ static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos) */ static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, struct writeback_control *wbc, struct folio *folio, - struct inode *inode, loff_t pos, unsigned len) + struct inode *inode, loff_t pos, loff_t end_pos, + unsigned len) { struct iomap_folio_state *ifs = folio->private; size_t poff = offset_in_folio(folio, pos); @@ -1793,15 +1794,60 @@ new_ioend: if (ifs) atomic_add(len, &ifs->write_bytes_pending); + + /* + * Clamp io_offset and io_size to the incore EOF so that ondisk + * file size updates in the ioend completion are byte-accurate. 
+ * This avoids recovering files with zeroed tail regions when + * writeback races with appending writes: + * + * Thread 1: Thread 2: + * ------------ ----------- + * write [A, A+B] + * update inode size to A+B + * submit I/O [A, A+BS] + * write [A+B, A+B+C] + * update inode size to A+B+C + * <I/O completes, updates disk size to min(A+B+C, A+BS)> + * <power failure> + * + * After reboot: + * 1) with A+B+C < A+BS, the file has zero padding in range + * [A+B, A+B+C] + * + * |< Block Size (BS) >| + * |DDDDDDDDDDDD0000000000000| + * ^ ^ ^ + * A A+B A+B+C + * (EOF) + * + * 2) with A+B+C > A+BS, the file has zero padding in range + * [A+B, A+BS] + * + * |< Block Size (BS) >|< Block Size (BS) >| + * |DDDDDDDDDDDD0000000000000|00000000000000000000000000| + * ^ ^ ^ ^ + * A A+B A+BS A+B+C + * (EOF) + * + * D = Valid Data + * 0 = Zero Padding + * + * Note that this defeats the ability to chain the ioends of + * appending writes. + */ wpc->ioend->io_size += len; + if (wpc->ioend->io_offset + wpc->ioend->io_size > end_pos) + wpc->ioend->io_size = end_pos - wpc->ioend->io_offset; + wbc_account_cgroup_owner(wbc, folio, len); return 0; } static int iomap_writepage_map_blocks(struct iomap_writepage_ctx *wpc, struct writeback_control *wbc, struct folio *folio, - struct inode *inode, u64 pos, unsigned dirty_len, - unsigned *count) + struct inode *inode, u64 pos, u64 end_pos, + unsigned dirty_len, unsigned *count) { int error; @@ -1826,7 +1872,7 @@ static int iomap_writepage_map_blocks(struct iomap_writepage_ctx *wpc, break; default: error = iomap_add_to_ioend(wpc, wbc, folio, inode, pos, - map_len); + end_pos, map_len); if (!error) (*count)++; break; @@ -1897,11 +1943,11 @@ static bool iomap_writepage_handle_eof(struct folio *folio, struct inode *inode, * remaining memory is zeroed when mapped, and writes to that * region are not written out to the file. * - * Also adjust the writeback range to skip all blocks entirely - * beyond i_size. 
+ * Also adjust the end_pos to the end of file and skip writeback + * for all blocks entirely beyond i_size. */ folio_zero_segment(folio, poff, folio_size(folio)); - *end_pos = round_up(isize, i_blocksize(inode)); + *end_pos = isize; } return true; @@ -1914,6 +1960,7 @@ static int iomap_writepage_map(struct iomap_writepage_ctx *wpc, struct inode *inode = folio->mapping->host; u64 pos = folio_pos(folio); u64 end_pos = pos + folio_size(folio); + u64 end_aligned = 0; unsigned count = 0; int error = 0; u32 rlen; @@ -1955,9 +2002,10 @@ static int iomap_writepage_map(struct iomap_writepage_ctx *wpc, /* * Walk through the folio to find dirty areas to write back. */ - while ((rlen = iomap_find_dirty_range(folio, &pos, end_pos))) { + end_aligned = round_up(end_pos, i_blocksize(inode)); + while ((rlen = iomap_find_dirty_range(folio, &pos, end_aligned))) { error = iomap_writepage_map_blocks(wpc, wbc, folio, inode, - pos, rlen, &count); + pos, end_pos, rlen, &count); if (error) break; pos += rlen; diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 9153ff3a08e7..e8e80761ac73 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -772,9 +772,9 @@ start_journal_io: /* * If the journal is not located on the file system device, * then we must flush the file system device before we issue - * the commit record + * the commit record and update the journal tail sequence. 
*/ - if (commit_transaction->t_need_data_flush && + if ((commit_transaction->t_need_data_flush || update_tail) && (journal->j_fs_dev != journal->j_dev) && (journal->j_flags & JBD2_BARRIER)) blkdev_issue_flush(journal->j_fs_dev); diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index 4556e4689024..ce63d5fde9c3 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -654,7 +654,7 @@ static void flush_descriptor(journal_t *journal, set_buffer_jwrite(descriptor); BUFFER_TRACE(descriptor, "write"); set_buffer_dirty(descriptor); - write_dirty_buffer(descriptor, REQ_SYNC); + write_dirty_buffer(descriptor, JBD2_JOURNAL_REQ_FLAGS); } #endif diff --git a/fs/namespace.c b/fs/namespace.c index 23e81c2a1e3f..6eec7794f707 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2055,9 +2055,15 @@ SYSCALL_DEFINE1(oldumount, char __user *, name) static bool is_mnt_ns_file(struct dentry *dentry) { + struct ns_common *ns; + /* Is this a proxy for a mount namespace? */ - return dentry->d_op == &ns_dentry_operations && - dentry->d_fsdata == &mntns_operations; + if (dentry->d_op != &ns_dentry_operations) + return false; + + ns = d_inode(dentry)->i_private; + + return ns->ops == &mntns_operations; } struct ns_common *from_mnt_ns(struct mnt_namespace *mnt) diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index 7ac34550c403..4dc9b8286355 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -275,22 +275,14 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq) netfs_stat(&netfs_n_rh_download); if (rreq->netfs_ops->prepare_read) { ret = rreq->netfs_ops->prepare_read(subreq); - if (ret < 0) { - atomic_dec(&rreq->nr_outstanding); - netfs_put_subrequest(subreq, false, - netfs_sreq_trace_put_cancel); - break; - } + if (ret < 0) + goto prep_failed; trace_netfs_sreq(subreq, netfs_sreq_trace_prepare); } slice = netfs_prepare_read_iterator(subreq); - if (slice < 0) { - atomic_dec(&rreq->nr_outstanding); - netfs_put_subrequest(subreq, false, 
netfs_sreq_trace_put_cancel); - ret = slice; - break; - } + if (slice < 0) + goto prep_iter_failed; rreq->netfs_ops->issue_read(subreq); goto done; @@ -302,6 +294,8 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq) trace_netfs_sreq(subreq, netfs_sreq_trace_submit); netfs_stat(&netfs_n_rh_zero); slice = netfs_prepare_read_iterator(subreq); + if (slice < 0) + goto prep_iter_failed; __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); netfs_read_subreq_terminated(subreq, 0, false); goto done; @@ -310,6 +304,8 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq) if (source == NETFS_READ_FROM_CACHE) { trace_netfs_sreq(subreq, netfs_sreq_trace_submit); slice = netfs_prepare_read_iterator(subreq); + if (slice < 0) + goto prep_iter_failed; netfs_read_cache_to_pagecache(rreq, subreq); goto done; } @@ -318,6 +314,14 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq) WARN_ON_ONCE(1); break; + prep_iter_failed: + ret = slice; + prep_failed: + subreq->error = ret; + atomic_dec(&rreq->nr_outstanding); + netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_cancel); + break; + done: size -= slice; start += slice; diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c index 88f2adfab75e..173e8b5e6a93 100644 --- a/fs/netfs/direct_write.c +++ b/fs/netfs/direct_write.c @@ -104,7 +104,6 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter * trace_netfs_rreq(wreq, netfs_rreq_trace_wait_ip); wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS, TASK_UNINTERRUPTIBLE); - smp_rmb(); /* Read error/transferred after RIP flag */ ret = wreq->error; if (ret == 0) { ret = wreq->transferred; diff --git a/fs/netfs/read_collect.c b/fs/netfs/read_collect.c index 3cbb289535a8..e8624f5c7fcc 100644 --- a/fs/netfs/read_collect.c +++ b/fs/netfs/read_collect.c @@ -62,10 +62,14 @@ static void netfs_unlock_read_folio(struct netfs_io_subrequest *subreq, } else { trace_netfs_folio(folio, netfs_folio_trace_read_done); } + 
+ folioq_clear(folioq, slot); } else { // TODO: Use of PG_private_2 is deprecated. if (test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) netfs_pgpriv2_mark_copy_to_cache(subreq, rreq, folioq, slot); + else + folioq_clear(folioq, slot); } if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) { @@ -77,8 +81,6 @@ static void netfs_unlock_read_folio(struct netfs_io_subrequest *subreq, folio_unlock(folio); } } - - folioq_clear(folioq, slot); } /* @@ -247,16 +249,17 @@ donation_changed: /* Deal with the trickiest case: that this subreq is in the middle of a * folio, not touching either edge, but finishes first. In such a - * case, we donate to the previous subreq, if there is one, so that the - * donation is only handled when that completes - and remove this - * subreq from the list. + * case, we donate to the previous subreq, if there is one and if it is + * contiguous, so that the donation is only handled when that completes + * - and remove this subreq from the list. * * If the previous subreq finished first, we will have acquired their * donation and should be able to unlock folios and/or donate nextwards. 
*/ if (!subreq->consumed && !prev_donated && - !list_is_first(&subreq->rreq_link, &rreq->subrequests)) { + !list_is_first(&subreq->rreq_link, &rreq->subrequests) && + subreq->start == prev->start + prev->len) { prev = list_prev_entry(subreq, rreq_link); WRITE_ONCE(prev->next_donated, prev->next_donated + subreq->len); subreq->start += subreq->len; @@ -378,8 +381,7 @@ static void netfs_rreq_assess(struct netfs_io_request *rreq) task_io_account_read(rreq->transferred); trace_netfs_rreq(rreq, netfs_rreq_trace_wake_ip); - clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &rreq->flags); - wake_up_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS); + clear_and_wake_up_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags); trace_netfs_rreq(rreq, netfs_rreq_trace_done); netfs_clear_subrequests(rreq, false); @@ -438,7 +440,7 @@ void netfs_read_subreq_progress(struct netfs_io_subrequest *subreq, rreq->origin == NETFS_READPAGE || rreq->origin == NETFS_READ_FOR_WRITE)) { netfs_consume_read_data(subreq, was_async); - __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags); + __set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); } } EXPORT_SYMBOL(netfs_read_subreq_progress); @@ -497,7 +499,7 @@ void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq, rreq->origin == NETFS_READPAGE || rreq->origin == NETFS_READ_FOR_WRITE)) { netfs_consume_read_data(subreq, was_async); - __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags); + __set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); } rreq->transferred += subreq->transferred; } @@ -511,10 +513,13 @@ void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq, } else { trace_netfs_sreq(subreq, netfs_sreq_trace_short); if (subreq->transferred > subreq->consumed) { - __set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); - __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags); - set_bit(NETFS_RREQ_NEED_RETRY, &rreq->flags); - } else if (!__test_and_set_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags)) { + /* If we didn't read new data, abandon retry. 
*/ + if (subreq->retry_count && + test_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags)) { + __set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); + set_bit(NETFS_RREQ_NEED_RETRY, &rreq->flags); + } + } else if (test_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags)) { __set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); set_bit(NETFS_RREQ_NEED_RETRY, &rreq->flags); } else { diff --git a/fs/netfs/read_pgpriv2.c b/fs/netfs/read_pgpriv2.c index ba5af89d37fa..54d5004fec18 100644 --- a/fs/netfs/read_pgpriv2.c +++ b/fs/netfs/read_pgpriv2.c @@ -170,6 +170,10 @@ void netfs_pgpriv2_write_to_the_cache(struct netfs_io_request *rreq) trace_netfs_write(wreq, netfs_write_trace_copy_to_cache); netfs_stat(&netfs_n_wh_copy_to_cache); + if (!wreq->io_streams[1].avail) { + netfs_put_request(wreq, false, netfs_rreq_trace_put_return); + goto couldnt_start; + } for (;;) { error = netfs_pgpriv2_copy_folio(wreq, folio); diff --git a/fs/netfs/read_retry.c b/fs/netfs/read_retry.c index 0350592ea804..21b4a54e545e 100644 --- a/fs/netfs/read_retry.c +++ b/fs/netfs/read_retry.c @@ -49,13 +49,15 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq) * up to the first permanently failed one. 
*/ if (!rreq->netfs_ops->prepare_read && - !test_bit(NETFS_RREQ_COPY_TO_CACHE, &rreq->flags)) { + !rreq->cache_resources.ops) { struct netfs_io_subrequest *subreq; list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { if (test_bit(NETFS_SREQ_FAILED, &subreq->flags)) break; if (__test_and_clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags)) { + __clear_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); + subreq->retry_count++; netfs_reset_iter(subreq); netfs_reissue_read(rreq, subreq); } @@ -137,7 +139,8 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq) stream0->sreq_max_len = subreq->len; __clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); - __set_bit(NETFS_SREQ_RETRYING, &subreq->flags); + __clear_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); + subreq->retry_count++; spin_lock_bh(&rreq->lock); list_add_tail(&subreq->rreq_link, &rreq->subrequests); @@ -213,7 +216,6 @@ abandon: subreq->error = -ENOMEM; __clear_bit(NETFS_SREQ_FAILED, &subreq->flags); __clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); - __clear_bit(NETFS_SREQ_RETRYING, &subreq->flags); } spin_lock_bh(&rreq->lock); list_splice_tail_init(&queue, &rreq->subrequests); diff --git a/fs/netfs/write_collect.c b/fs/netfs/write_collect.c index 1d438be2e1b4..ca3a11ed9b54 100644 --- a/fs/netfs/write_collect.c +++ b/fs/netfs/write_collect.c @@ -179,7 +179,6 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq, struct iov_iter source = subreq->io_iter; iov_iter_revert(&source, subreq->len - source.count); - __set_bit(NETFS_SREQ_RETRYING, &subreq->flags); netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); netfs_reissue_write(stream, subreq, &source); } @@ -234,7 +233,7 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq, /* Renegotiate max_len (wsize) */ trace_netfs_sreq(subreq, netfs_sreq_trace_retry); __clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); - __set_bit(NETFS_SREQ_RETRYING, &subreq->flags); + subreq->retry_count++; 
stream->prepare_write(subreq); part = min(len, stream->sreq_max_len); @@ -279,7 +278,7 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq, subreq->start = start; subreq->debug_index = atomic_inc_return(&wreq->subreq_counter); subreq->stream_nr = to->stream_nr; - __set_bit(NETFS_SREQ_RETRYING, &subreq->flags); + subreq->retry_count = 1; trace_netfs_sreq_ref(wreq->debug_id, subreq->debug_index, refcount_read(&subreq->ref), @@ -501,8 +500,7 @@ reassess_streams: goto need_retry; if ((notes & MADE_PROGRESS) && test_bit(NETFS_RREQ_PAUSE, &wreq->flags)) { trace_netfs_rreq(wreq, netfs_rreq_trace_unpause); - clear_bit_unlock(NETFS_RREQ_PAUSE, &wreq->flags); - wake_up_bit(&wreq->flags, NETFS_RREQ_PAUSE); + clear_and_wake_up_bit(NETFS_RREQ_PAUSE, &wreq->flags); } if (notes & NEED_REASSESS) { @@ -605,8 +603,7 @@ void netfs_write_collection_worker(struct work_struct *work) _debug("finished"); trace_netfs_rreq(wreq, netfs_rreq_trace_wake_ip); - clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &wreq->flags); - wake_up_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS); + clear_and_wake_up_bit(NETFS_RREQ_IN_PROGRESS, &wreq->flags); if (wreq->iocb) { size_t written = min(wreq->transferred, wreq->len); @@ -714,8 +711,7 @@ void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error, trace_netfs_sreq(subreq, netfs_sreq_trace_terminated); - clear_bit_unlock(NETFS_SREQ_IN_PROGRESS, &subreq->flags); - wake_up_bit(&subreq->flags, NETFS_SREQ_IN_PROGRESS); + clear_and_wake_up_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags); /* If we are at the head of the queue, wake up the collector, * transferring a ref to it if we were the ones to do so. 
diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c index bf6d507578e5..ff0e82505a0b 100644 --- a/fs/netfs/write_issue.c +++ b/fs/netfs/write_issue.c @@ -244,6 +244,8 @@ void netfs_reissue_write(struct netfs_io_stream *stream, iov_iter_advance(source, size); iov_iter_truncate(&subreq->io_iter, size); + subreq->retry_count++; + __clear_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); __set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags); netfs_do_issue_write(stream, subreq); } diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index 810269ee0a50..d49e4ce27999 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -263,6 +263,12 @@ int nfs_netfs_readahead(struct readahead_control *ractl) static atomic_t nfs_netfs_debug_id; static int nfs_netfs_init_request(struct netfs_io_request *rreq, struct file *file) { + if (!file) { + if (WARN_ON_ONCE(rreq->origin != NETFS_PGPRIV2_COPY_TO_CACHE)) + return -EIO; + return 0; + } + rreq->netfs_priv = get_nfs_open_context(nfs_file_open_context(file)); rreq->debug_id = atomic_inc_return(&nfs_netfs_debug_id); /* [DEPRECATED] Use PG_private_2 to mark folio being written to the cache. 
*/ @@ -274,7 +280,8 @@ static int nfs_netfs_init_request(struct netfs_io_request *rreq, struct file *fi static void nfs_netfs_free_request(struct netfs_io_request *rreq) { - put_nfs_open_context(rreq->netfs_priv); + if (rreq->netfs_priv) + put_nfs_open_context(rreq->netfs_priv); } static struct nfs_netfs_io_data *nfs_netfs_alloc(struct netfs_io_subrequest *sreq) diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c index dec553034027..e933f9c65d90 100644 --- a/fs/notify/fdinfo.c +++ b/fs/notify/fdinfo.c @@ -47,10 +47,8 @@ static void show_mark_fhandle(struct seq_file *m, struct inode *inode) size = f->handle_bytes >> 2; ret = exportfs_encode_fid(inode, (struct fid *)f->f_handle, &size); - if ((ret == FILEID_INVALID) || (ret < 0)) { - WARN_ONCE(1, "Can't encode file handler for inotify: %d\n", ret); + if ((ret == FILEID_INVALID) || (ret < 0)) return; - } f->handle_type = ret; f->handle_bytes = size * sizeof(u32); diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index 2b0daced98eb..3404e7a30c33 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -893,7 +893,7 @@ static int ocfs2_get_next_id(struct super_block *sb, struct kqid *qid) int status = 0; trace_ocfs2_get_next_id(from_kqid(&init_user_ns, *qid), type); - if (!sb_has_quota_loaded(sb, type)) { + if (!sb_has_quota_active(sb, type)) { status = -ESRCH; goto out; } diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index 73d3367c533b..2956d888c131 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -867,6 +867,7 @@ out: brelse(oinfo->dqi_libh); brelse(oinfo->dqi_lqi_bh); kfree(oinfo); + info->dqi_priv = NULL; return status; } diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 3601ddfeddc2..0c28e5fa3407 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -415,13 +415,13 @@ int ovl_set_attr(struct ovl_fs *ofs, struct dentry *upperdentry, return err; } -struct ovl_fh *ovl_encode_real_fh(struct ovl_fs *ofs, struct dentry *real, 
+struct ovl_fh *ovl_encode_real_fh(struct ovl_fs *ofs, struct inode *realinode, bool is_upper) { struct ovl_fh *fh; int fh_type, dwords; int buflen = MAX_HANDLE_SZ; - uuid_t *uuid = &real->d_sb->s_uuid; + uuid_t *uuid = &realinode->i_sb->s_uuid; int err; /* Make sure the real fid stays 32bit aligned */ @@ -438,13 +438,13 @@ struct ovl_fh *ovl_encode_real_fh(struct ovl_fs *ofs, struct dentry *real, * the price or reconnecting the dentry. */ dwords = buflen >> 2; - fh_type = exportfs_encode_fh(real, (void *)fh->fb.fid, &dwords, 0); + fh_type = exportfs_encode_inode_fh(realinode, (void *)fh->fb.fid, + &dwords, NULL, 0); buflen = (dwords << 2); err = -EIO; - if (WARN_ON(fh_type < 0) || - WARN_ON(buflen > MAX_HANDLE_SZ) || - WARN_ON(fh_type == FILEID_INVALID)) + if (fh_type < 0 || fh_type == FILEID_INVALID || + WARN_ON(buflen > MAX_HANDLE_SZ)) goto out_err; fh->fb.version = OVL_FH_VERSION; @@ -480,7 +480,7 @@ struct ovl_fh *ovl_get_origin_fh(struct ovl_fs *ofs, struct dentry *origin) if (!ovl_can_decode_fh(origin->d_sb)) return NULL; - return ovl_encode_real_fh(ofs, origin, false); + return ovl_encode_real_fh(ofs, d_inode(origin), false); } int ovl_set_origin_fh(struct ovl_fs *ofs, const struct ovl_fh *fh, @@ -505,7 +505,7 @@ static int ovl_set_upper_fh(struct ovl_fs *ofs, struct dentry *upper, const struct ovl_fh *fh; int err; - fh = ovl_encode_real_fh(ofs, upper, true); + fh = ovl_encode_real_fh(ofs, d_inode(upper), true); if (IS_ERR(fh)) return PTR_ERR(fh); diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c index 5868cb222955..444aeeccb6da 100644 --- a/fs/overlayfs/export.c +++ b/fs/overlayfs/export.c @@ -176,35 +176,37 @@ static int ovl_connect_layer(struct dentry *dentry) * * Return 0 for upper file handle, > 0 for lower file handle or < 0 on error. 
*/ -static int ovl_check_encode_origin(struct dentry *dentry) +static int ovl_check_encode_origin(struct inode *inode) { - struct ovl_fs *ofs = OVL_FS(dentry->d_sb); + struct ovl_fs *ofs = OVL_FS(inode->i_sb); bool decodable = ofs->config.nfs_export; + struct dentry *dentry; + int err; /* No upper layer? */ if (!ovl_upper_mnt(ofs)) return 1; /* Lower file handle for non-upper non-decodable */ - if (!ovl_dentry_upper(dentry) && !decodable) + if (!ovl_inode_upper(inode) && !decodable) return 1; /* Upper file handle for pure upper */ - if (!ovl_dentry_lower(dentry)) + if (!ovl_inode_lower(inode)) return 0; /* * Root is never indexed, so if there's an upper layer, encode upper for * root. */ - if (dentry == dentry->d_sb->s_root) + if (inode == d_inode(inode->i_sb->s_root)) return 0; /* * Upper decodable file handle for non-indexed upper. */ - if (ovl_dentry_upper(dentry) && decodable && - !ovl_test_flag(OVL_INDEX, d_inode(dentry))) + if (ovl_inode_upper(inode) && decodable && + !ovl_test_flag(OVL_INDEX, inode)) return 0; /* @@ -213,14 +215,23 @@ static int ovl_check_encode_origin(struct dentry *dentry) * ovl_connect_layer() will try to make origin's layer "connected" by * copying up a "connectable" ancestor. */ - if (d_is_dir(dentry) && decodable) - return ovl_connect_layer(dentry); + if (!decodable || !S_ISDIR(inode->i_mode)) + return 1; + + dentry = d_find_any_alias(inode); + if (!dentry) + return -ENOENT; + + err = ovl_connect_layer(dentry); + dput(dentry); + if (err < 0) + return err; /* Lower file handle for indexed and non-upper dir/non-dir */ return 1; } -static int ovl_dentry_to_fid(struct ovl_fs *ofs, struct dentry *dentry, +static int ovl_dentry_to_fid(struct ovl_fs *ofs, struct inode *inode, u32 *fid, int buflen) { struct ovl_fh *fh = NULL; @@ -231,13 +242,13 @@ static int ovl_dentry_to_fid(struct ovl_fs *ofs, struct dentry *dentry, * Check if we should encode a lower or upper file handle and maybe * copy up an ancestor to make lower file handle connectable. 
*/ - err = enc_lower = ovl_check_encode_origin(dentry); + err = enc_lower = ovl_check_encode_origin(inode); if (enc_lower < 0) goto fail; /* Encode an upper or lower file handle */ - fh = ovl_encode_real_fh(ofs, enc_lower ? ovl_dentry_lower(dentry) : - ovl_dentry_upper(dentry), !enc_lower); + fh = ovl_encode_real_fh(ofs, enc_lower ? ovl_inode_lower(inode) : + ovl_inode_upper(inode), !enc_lower); if (IS_ERR(fh)) return PTR_ERR(fh); @@ -251,8 +262,8 @@ out: return err; fail: - pr_warn_ratelimited("failed to encode file handle (%pd2, err=%i)\n", - dentry, err); + pr_warn_ratelimited("failed to encode file handle (ino=%lu, err=%i)\n", + inode->i_ino, err); goto out; } @@ -260,19 +271,13 @@ static int ovl_encode_fh(struct inode *inode, u32 *fid, int *max_len, struct inode *parent) { struct ovl_fs *ofs = OVL_FS(inode->i_sb); - struct dentry *dentry; int bytes, buflen = *max_len << 2; /* TODO: encode connectable file handles */ if (parent) return FILEID_INVALID; - dentry = d_find_any_alias(inode); - if (!dentry) - return FILEID_INVALID; - - bytes = ovl_dentry_to_fid(ofs, dentry, fid, buflen); - dput(dentry); + bytes = ovl_dentry_to_fid(ofs, inode, fid, buflen); if (bytes <= 0) return FILEID_INVALID; diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index 7e27b7d4adee..cea820cb3b55 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -542,7 +542,7 @@ int ovl_verify_origin_xattr(struct ovl_fs *ofs, struct dentry *dentry, struct ovl_fh *fh; int err; - fh = ovl_encode_real_fh(ofs, real, is_upper); + fh = ovl_encode_real_fh(ofs, d_inode(real), is_upper); err = PTR_ERR(fh); if (IS_ERR(fh)) { fh = NULL; @@ -738,7 +738,7 @@ int ovl_get_index_name(struct ovl_fs *ofs, struct dentry *origin, struct ovl_fh *fh; int err; - fh = ovl_encode_real_fh(ofs, origin, false); + fh = ovl_encode_real_fh(ofs, d_inode(origin), false); if (IS_ERR(fh)) return PTR_ERR(fh); diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index b361f35762be..0021e2025020 100644 --- 
a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -865,7 +865,7 @@ int ovl_copy_up_with_data(struct dentry *dentry); int ovl_maybe_copy_up(struct dentry *dentry, int flags); int ovl_copy_xattr(struct super_block *sb, const struct path *path, struct dentry *new); int ovl_set_attr(struct ovl_fs *ofs, struct dentry *upper, struct kstat *stat); -struct ovl_fh *ovl_encode_real_fh(struct ovl_fs *ofs, struct dentry *real, +struct ovl_fh *ovl_encode_real_fh(struct ovl_fs *ofs, struct inode *realinode, bool is_upper); struct ovl_fh *ovl_get_origin_fh(struct ovl_fs *ofs, struct dentry *origin); int ovl_set_origin_fh(struct ovl_fs *ofs, const struct ovl_fh *fh, diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 38a5a3e9cba2..f02cd362309a 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1810,7 +1810,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, } for (; addr != end; addr += PAGE_SIZE, idx++) { - unsigned long cur_flags = flags; + u64 cur_flags = flags; pagemap_entry_t pme; if (folio && (flags & PM_PRESENT) && diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c index 85925ec0051a..3310d1ad4d0e 100644 --- a/fs/qnx6/inode.c +++ b/fs/qnx6/inode.c @@ -179,8 +179,7 @@ static int qnx6_statfs(struct dentry *dentry, struct kstatfs *buf) */ static const char *qnx6_checkroot(struct super_block *s) { - static char match_root[2][3] = {".\0\0", "..\0"}; - int i, error = 0; + int error = 0; struct qnx6_dir_entry *dir_entry; struct inode *root = d_inode(s->s_root); struct address_space *mapping = root->i_mapping; @@ -189,11 +188,9 @@ static const char *qnx6_checkroot(struct super_block *s) if (IS_ERR(folio)) return "error reading root directory"; dir_entry = kmap_local_folio(folio, 0); - for (i = 0; i < 2; i++) { - /* maximum 3 bytes - due to match_root limitation */ - if (strncmp(dir_entry[i].de_fname, match_root[i], 3)) - error = 1; - } + if (memcmp(dir_entry[0].de_fname, ".", 2) || + memcmp(dir_entry[1].de_fname, "..", 3)) + 
error = 1; folio_release_kmap(folio, dir_entry); if (error) return "error reading root directory."; diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c index bd42a419458e..6cb1e81993f8 100644 --- a/fs/smb/client/cifssmb.c +++ b/fs/smb/client/cifssmb.c @@ -1319,14 +1319,16 @@ cifs_readv_callback(struct mid_q_entry *mid) } if (rdata->result == -ENODATA) { - __set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags); rdata->result = 0; + __set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags); } else { size_t trans = rdata->subreq.transferred + rdata->got_bytes; if (trans < rdata->subreq.len && rdata->subreq.start + trans == ictx->remote_i_size) { - __set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags); rdata->result = 0; + __set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags); + } else if (rdata->got_bytes > 0) { + __set_bit(NETFS_SREQ_MADE_PROGRESS, &rdata->subreq.flags); } } @@ -1670,10 +1672,13 @@ cifs_writev_callback(struct mid_q_entry *mid) if (written > wdata->subreq.len) written &= 0xFFFF; - if (written < wdata->subreq.len) + if (written < wdata->subreq.len) { result = -ENOSPC; - else + } else { result = written; + if (written > 0) + __set_bit(NETFS_SREQ_MADE_PROGRESS, &wdata->subreq.flags); + } break; case MID_REQUEST_SUBMITTED: case MID_RETRY_NEEDED: diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index 959359301250..0577556f0a41 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -4615,6 +4615,7 @@ smb2_readv_callback(struct mid_q_entry *mid) __set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags); rdata->result = 0; } + __set_bit(NETFS_SREQ_MADE_PROGRESS, &rdata->subreq.flags); } trace_smb3_rw_credits(rreq_debug_id, subreq_debug_index, rdata->credits.value, server->credits, server->in_flight, @@ -4842,10 +4843,12 @@ smb2_writev_callback(struct mid_q_entry *mid) cifs_stats_bytes_written(tcon, written); - if (written < wdata->subreq.len) + if (written < wdata->subreq.len) { wdata->result = -ENOSPC; - else + } else if (written > 0) { 
wdata->subreq.len = written; + __set_bit(NETFS_SREQ_MADE_PROGRESS, &wdata->subreq.flags); + } break; case MID_REQUEST_SUBMITTED: case MID_RETRY_NEEDED: @@ -5014,7 +5017,7 @@ smb2_async_writev(struct cifs_io_subrequest *wdata) } #endif - if (test_bit(NETFS_SREQ_RETRYING, &wdata->subreq.flags)) + if (wdata->subreq.retry_count > 0) smb2_set_replay(server, &rqst); cifs_dbg(FYI, "async write at %llu %u bytes iter=%zx\n", diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index f711bfd75c4d..4bf70cfec826 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -412,6 +412,29 @@ struct ethtool_eth_phy_stats { ); }; +/** + * struct ethtool_phy_stats - PHY-level statistics counters + * @rx_packets: Total successfully received frames + * @rx_bytes: Total successfully received bytes + * @rx_errors: Total received frames with errors (e.g., CRC errors) + * @tx_packets: Total successfully transmitted frames + * @tx_bytes: Total successfully transmitted bytes + * @tx_errors: Total transmitted frames with errors + * + * This structure provides a standardized interface for reporting + * PHY-level statistics counters. It is designed to expose statistics + * commonly provided by PHYs but not explicitly defined in the IEEE + * 802.3 standard. + */ +struct ethtool_phy_stats { + u64 rx_packets; + u64 rx_bytes; + u64 rx_errors; + u64 tx_packets; + u64 tx_bytes; + u64 tx_errors; +}; + /* Basic IEEE 802.3 MAC Ctrl statistics (30.3.3.*), not otherwise exposed * via a more targeted API. 
*/ diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index d495cbdb52cb..38456b42cdb5 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -176,6 +176,7 @@ struct netpoll; * @real_dev_addr: address of underlying netdevice * @dent: proc dir entry * @vlan_pcpu_stats: ptr to percpu rx stats + * @netpoll: netpoll instance "propagated" down to @real_dev */ struct vlan_dev_priv { unsigned int nr_ingress_mappings; @@ -414,6 +415,8 @@ static inline int __vlan_insert_tag(struct sk_buff *skb, * doesn't have to worry about freeing the original skb. * * Does not change skb->protocol so this function can be used during receive. + * + * Return: modified @skb on success, NULL on error (@skb is freed). */ static inline struct sk_buff *vlan_insert_inner_tag(struct sk_buff *skb, __be16 vlan_proto, @@ -443,6 +446,8 @@ static inline struct sk_buff *vlan_insert_inner_tag(struct sk_buff *skb, * doesn't have to worry about freeing the original skb. * * Does not change skb->protocol so this function can be used during receive. + * + * Return: modified @skb on success, NULL on error (@skb is freed). */ static inline struct sk_buff *vlan_insert_tag(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) @@ -461,6 +466,8 @@ static inline struct sk_buff *vlan_insert_tag(struct sk_buff *skb, * * Following the skb_unshare() example, in case of error, the calling function * doesn't have to worry about freeing the original skb. + * + * Return: modified @skb on success, NULL on error (@skb is freed). */ static inline struct sk_buff *vlan_insert_tag_set_proto(struct sk_buff *skb, __be16 vlan_proto, @@ -582,7 +589,7 @@ static inline int vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci) } /** - * vlan_get_protocol - get protocol EtherType. + * __vlan_get_protocol_offset() - get protocol EtherType. 
* @skb: skbuff to query * @type: first vlan protocol * @mac_offset: MAC offset @@ -808,9 +815,11 @@ static inline netdev_features_t vlan_features_check(struct sk_buff *skb, * @h1: Pointer to vlan header * @h2: Pointer to vlan header * - * Compare two vlan headers, returns 0 if equal. + * Compare two vlan headers. * * Please note that alignment of h1 & h2 are only guaranteed to be 16 bits. + * + * Return: 0 if equal, arbitrary non-zero value if not equal. */ static inline unsigned long compare_vlan_header(const struct vlan_hdr *h1, const struct vlan_hdr *h2) diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index 0d5448c0b86c..a3ce553413de 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -18,6 +18,11 @@ struct io_uring_cmd { u8 pdu[32]; /* available inline for free use */ }; +struct io_uring_cmd_data { + struct io_uring_sqe sqes[2]; + void *op_data; +}; + static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe) { return sqe->cmd; @@ -113,4 +118,9 @@ static inline struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd return cmd_to_io_kiocb(cmd)->tctx->task; } +static inline struct io_uring_cmd_data *io_uring_cmd_get_async_data(struct io_uring_cmd *cmd) +{ + return cmd_to_io_kiocb(cmd)->async_data; +} + #endif /* _LINUX_IO_URING_CMD_H */ diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 5675af6b740c..75bf54e76f3b 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -335,7 +335,7 @@ struct iomap_ioend { u16 io_type; u16 io_flags; /* IOMAP_F_* */ struct inode *io_inode; /* file being written to */ - size_t io_size; /* size of the extent */ + size_t io_size; /* size of data within eof */ loff_t io_offset; /* offset in the file */ sector_t io_sector; /* start sector of ioend */ struct bio io_bio; /* MUST BE LAST! 
*/ diff --git a/include/linux/memfd.h b/include/linux/memfd.h index 3f2cf339ceaf..d437e3070850 100644 --- a/include/linux/memfd.h +++ b/include/linux/memfd.h @@ -7,6 +7,7 @@ #ifdef CONFIG_MEMFD_CREATE extern long memfd_fcntl(struct file *file, unsigned int cmd, unsigned int arg); struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx); +unsigned int *memfd_file_seals_ptr(struct file *file); #else static inline long memfd_fcntl(struct file *f, unsigned int c, unsigned int a) { @@ -16,6 +17,19 @@ static inline struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx) { return ERR_PTR(-EINVAL); } + +static inline unsigned int *memfd_file_seals_ptr(struct file *file) +{ + return NULL; +} #endif +/* Retrieve memfd seals associated with the file, if any. */ +static inline unsigned int memfd_file_seals(struct file *file) +{ + unsigned int *sealsp = memfd_file_seals_ptr(file); + + return sealsp ? *sealsp : 0; +} + #endif /* __LINUX_MEMFD_H */ diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index ea48eb879a0f..fed666c5bd16 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -691,7 +691,6 @@ struct mlx5_timer { struct timecounter tc; u32 nominal_c_mult; unsigned long overflow_period; - struct delayed_work overflow_work; }; struct mlx5_clock { diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 370f533da107..bb99a35fc6a2 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -7025,6 +7025,7 @@ struct mlx5_ifc_alloc_packet_reformat_context_out_bits { enum { MLX5_REFORMAT_CONTEXT_ANCHOR_MAC_START = 0x1, + MLX5_REFORMAT_CONTEXT_ANCHOR_VLAN_START = 0x2, MLX5_REFORMAT_CONTEXT_ANCHOR_IP_START = 0x7, MLX5_REFORMAT_CONTEXT_ANCHOR_TCP_UDP_START = 0x9, }; diff --git a/include/linux/mm.h b/include/linux/mm.h index 338a76ce9083..b1c3db9cf355 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3125,6 +3125,7 @@ static inline bool pagetable_pmd_ctor(struct 
ptdesc *ptdesc) if (!pmd_ptlock_init(ptdesc)) return false; __folio_set_pgtable(folio); + ptdesc_pmd_pts_init(ptdesc); lruvec_stat_add_folio(folio, NR_PAGETABLE); return true; } @@ -4101,6 +4102,37 @@ void mem_dump_obj(void *object); static inline void mem_dump_obj(void *object) {} #endif +static inline bool is_write_sealed(int seals) +{ + return seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE); +} + +/** + * is_readonly_sealed - Checks whether write-sealed but mapped read-only, + * in which case writes should be disallowed going + * forward. + * @seals: the seals to check + * @vm_flags: the VMA flags to check + * + * Returns whether readonly sealed, in which case writes should be disallowed + * going forward. + */ +static inline bool is_readonly_sealed(int seals, vm_flags_t vm_flags) +{ + /* + * Since an F_SEAL_[FUTURE_]WRITE sealed memfd can be mapped as + * MAP_SHARED and read-only, take care to not allow mprotect to + * revert protections on such mappings. Do this only for shared + * mappings. For private mappings, don't need to mask + * VM_MAYWRITE as we still want them to be COW-writable. + */ + if (is_write_sealed(seals) && + ((vm_flags & (VM_SHARED | VM_WRITE)) == VM_SHARED)) + return true; + + return false; +} + /** * seal_check_write - Check for F_SEAL_WRITE or F_SEAL_FUTURE_WRITE flags and * handle them. @@ -4112,24 +4144,15 @@ static inline void mem_dump_obj(void *object) {} */ static inline int seal_check_write(int seals, struct vm_area_struct *vma) { - if (seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) { - /* - * New PROT_WRITE and MAP_SHARED mmaps are not allowed when - * write seals are active. - */ - if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE)) - return -EPERM; - - /* - * Since an F_SEAL_[FUTURE_]WRITE sealed memfd can be mapped as - * MAP_SHARED and read-only, take care to not allow mprotect to - * revert protections on such mappings. Do this only for shared - * mappings.
For private mappings, don't need to mask - * VM_MAYWRITE as we still want them to be COW-writable. - */ - if (vma->vm_flags & VM_SHARED) - vm_flags_clear(vma, VM_MAYWRITE); - } + if (!is_write_sealed(seals)) + return 0; + + /* + * New PROT_WRITE and MAP_SHARED mmaps are not allowed when + * write seals are active. + */ + if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE)) + return -EPERM; return 0; } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 7361a8f3ab68..332cee285662 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -445,6 +445,7 @@ FOLIO_MATCH(compound_head, _head_2a); * @pt_index: Used for s390 gmap. * @pt_mm: Used for x86 pgds. * @pt_frag_refcount: For fragmented page table tracking. Powerpc only. + * @pt_share_count: Used for HugeTLB PMD page table share count. * @_pt_pad_2: Padding to ensure proper alignment. * @ptl: Lock for the page table. * @__page_type: Same as page->page_type. Unused for page tables. @@ -471,6 +472,9 @@ struct ptdesc { pgoff_t pt_index; struct mm_struct *pt_mm; atomic_t pt_frag_refcount; +#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING + atomic_t pt_share_count; +#endif }; union { @@ -516,6 +520,32 @@ static_assert(sizeof(struct ptdesc) <= sizeof(struct page)); const struct page *: (const struct ptdesc *)(p), \ struct page *: (struct ptdesc *)(p))) +#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING +static inline void ptdesc_pmd_pts_init(struct ptdesc *ptdesc) +{ + atomic_set(&ptdesc->pt_share_count, 0); +} + +static inline void ptdesc_pmd_pts_inc(struct ptdesc *ptdesc) +{ + atomic_inc(&ptdesc->pt_share_count); +} + +static inline void ptdesc_pmd_pts_dec(struct ptdesc *ptdesc) +{ + atomic_dec(&ptdesc->pt_share_count); +} + +static inline int ptdesc_pmd_pts_count(struct ptdesc *ptdesc) +{ + return atomic_read(&ptdesc->pt_share_count); +} +#else +static inline void ptdesc_pmd_pts_init(struct ptdesc *ptdesc) +{ +} +#endif + /* * Used for sizing the vmemmap region on some architectures */ 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2593019ad5b1..dd8f6f8991fe 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2261,7 +2261,7 @@ struct net_device { void *atalk_ptr; #endif #if IS_ENABLED(CONFIG_AX25) - void *ax25_ptr; + struct ax25_dev __rcu *ax25_ptr; #endif #if IS_ENABLED(CONFIG_CFG80211) struct wireless_dev *ieee80211_ptr; @@ -3238,7 +3238,6 @@ static inline void unregister_netdevice(struct net_device *dev) int netdev_refcnt_read(const struct net_device *dev); void free_netdev(struct net_device *dev); -void init_dummy_netdev(struct net_device *dev); struct net_device *netdev_get_xmit_slave(struct net_device *dev, struct sk_buff *skb, @@ -3252,7 +3251,6 @@ struct net_device *netdev_get_by_index(struct net *net, int ifindex, struct net_device *netdev_get_by_name(struct net *net, const char *name, netdevice_tracker *tracker, gfp_t gfp); struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex); -struct net_device *dev_get_by_napi_id(unsigned int napi_id); void netdev_copy_name(struct net_device *dev, char *name); static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev, @@ -4295,7 +4293,7 @@ static inline bool netif_carrier_ok(const struct net_device *dev) unsigned long dev_trans_start(struct net_device *dev); -void __netdev_watchdog_up(struct net_device *dev); +void netdev_watchdog_up(struct net_device *dev); void netif_carrier_on(struct net_device *dev); void netif_carrier_off(struct net_device *dev); diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 5eaceef41e6c..ecdd5ced16a8 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -185,6 +185,7 @@ struct netfs_io_subrequest { short error; /* 0 or error that occurred */ unsigned short debug_index; /* Index in list (for debugging output) */ unsigned int nr_segs; /* Number of segs in io_iter */ + u8 retry_count; /* The number of retries (0 on initial pass) */ enum netfs_io_source source; /* 
Where to read from/write to */ unsigned char stream_nr; /* I/O stream this belongs to */ unsigned char curr_folioq_slot; /* Folio currently being read */ @@ -194,14 +195,13 @@ struct netfs_io_subrequest { #define NETFS_SREQ_COPY_TO_CACHE 0 /* Set if should copy the data to the cache */ #define NETFS_SREQ_CLEAR_TAIL 1 /* Set if the rest of the read should be cleared */ #define NETFS_SREQ_SEEK_DATA_READ 3 /* Set if ->read() should SEEK_DATA first */ -#define NETFS_SREQ_NO_PROGRESS 4 /* Set if we didn't manage to read any data */ +#define NETFS_SREQ_MADE_PROGRESS 4 /* Set if we transferred at least some data */ #define NETFS_SREQ_ONDEMAND 5 /* Set if it's from on-demand read mode */ #define NETFS_SREQ_BOUNDARY 6 /* Set if ends on hard boundary (eg. ceph object) */ #define NETFS_SREQ_HIT_EOF 7 /* Set if short due to EOF */ #define NETFS_SREQ_IN_PROGRESS 8 /* Unlocked when the subrequest completes */ #define NETFS_SREQ_NEED_RETRY 9 /* Set if the filesystem requests a retry */ -#define NETFS_SREQ_RETRYING 10 /* Set if we're retrying */ -#define NETFS_SREQ_FAILED 11 /* Set if the subreq failed unretryably */ +#define NETFS_SREQ_FAILED 10 /* Set if the subreq failed unretryably */ }; enum netfs_io_origin { @@ -269,7 +269,6 @@ struct netfs_io_request { size_t prev_donated; /* Fallback for subreq->prev_donated */ refcount_t ref; unsigned long flags; -#define NETFS_RREQ_COPY_TO_CACHE 1 /* Need to write to the cache */ #define NETFS_RREQ_NO_UNLOCK_FOLIO 2 /* Don't unlock no_unlock_folio on completion */ #define NETFS_RREQ_DONT_UNLOCK_FOLIOS 3 /* Don't unlock the folios on completion */ #define NETFS_RREQ_FAILED 4 /* The request failed */ diff --git a/include/linux/pcs/pcs-xpcs.h b/include/linux/pcs/pcs-xpcs.h index b5b5d17998b8..733f4ddd2ef1 100644 --- a/include/linux/pcs/pcs-xpcs.h +++ b/include/linux/pcs/pcs-xpcs.h @@ -50,7 +50,6 @@ struct dw_xpcs; struct phylink_pcs *xpcs_to_phylink_pcs(struct dw_xpcs *xpcs); int xpcs_get_an_mode(struct dw_xpcs *xpcs, phy_interface_t 
interface); -void xpcs_get_interfaces(struct dw_xpcs *xpcs, unsigned long *interfaces); int xpcs_config_eee(struct dw_xpcs *xpcs, int mult_fact_100ns, int enable); struct dw_xpcs *xpcs_create_mdiodev(struct mii_bus *bus, int addr); diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h index 35842d1e3879..5b520fe86b60 100644 --- a/include/linux/percpu-defs.h +++ b/include/linux/percpu-defs.h @@ -221,10 +221,7 @@ do { \ } while (0) #define PERCPU_PTR(__p) \ -({ \ - unsigned long __pcpu_ptr = (__force unsigned long)(__p); \ - (typeof(*(__p)) __force __kernel *)(__pcpu_ptr); \ -}) + (typeof(*(__p)) __force __kernel *)((__force unsigned long)(__p)) #ifdef CONFIG_SMP diff --git a/include/linux/phy.h b/include/linux/phy.h index 5bc71d59910c..afaae74d0949 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1144,6 +1144,53 @@ struct phy_driver { int (*cable_test_get_status)(struct phy_device *dev, bool *finished); /* Get statistics from the PHY using ethtool */ + /** + * @get_phy_stats: Retrieve PHY statistics. + * @dev: The PHY device for which the statistics are retrieved. + * @eth_stats: structure where Ethernet PHY stats will be stored. + * @stats: structure where additional PHY-specific stats will be stored. + * + * Retrieves the supported PHY statistics and populates the provided + * structures. The input structures are pre-initialized with + * `ETHTOOL_STAT_NOT_SET`, and the driver must only modify members + * corresponding to supported statistics. Unmodified members will remain + * set to `ETHTOOL_STAT_NOT_SET` and will not be returned to userspace. + */ + void (*get_phy_stats)(struct phy_device *dev, + struct ethtool_eth_phy_stats *eth_stats, + struct ethtool_phy_stats *stats); + + /** + * @get_link_stats: Retrieve link statistics. + * @dev: The PHY device for which the statistics are retrieved. + * @link_stats: structure where link-specific stats will be stored. + * + * Retrieves link-related statistics for the given PHY device. 
The input + * structure is pre-initialized with `ETHTOOL_STAT_NOT_SET`, and the + * driver must only modify members corresponding to supported + * statistics. Unmodified members will remain set to + * `ETHTOOL_STAT_NOT_SET` and will not be returned to userspace. + */ + void (*get_link_stats)(struct phy_device *dev, + struct ethtool_link_ext_stats *link_stats); + + /** + * @update_stats: Trigger periodic statistics updates. + * @dev: The PHY device for which statistics updates are triggered. + * + * Periodically gathers statistics from the PHY device to update locally + * maintained 64-bit counters. This is necessary for PHYs that implement + * reduced-width counters (e.g., 16-bit or 32-bit) which can overflow + * more frequently compared to 64-bit counters. By invoking this + * callback, drivers can fetch the current counter values, handle + * overflow detection, and accumulate the results into local 64-bit + * counters for accurate reporting through the `get_phy_stats` and + * `get_link_stats` interfaces. + * + * Return: 0 on success or a negative error code on failure. 
+ */ + int (*update_stats)(struct phy_device *dev); + /** @get_sset_count: Number of statistic counters */ int (*get_sset_count)(struct phy_device *dev); /** @get_strings: Names of the statistic counters */ @@ -1634,6 +1681,9 @@ static inline bool phy_polling_mode(struct phy_device *phydev) if (phydev->drv->flags & PHY_POLL_CABLE_TEST) return true; + if (phydev->drv->update_stats) + return true; + return phydev->irq == PHY_POLL; } @@ -2096,6 +2146,7 @@ int phy_unregister_fixup(const char *bus_id, u32 phy_uid, u32 phy_uid_mask); int phy_unregister_fixup_for_id(const char *bus_id); int phy_unregister_fixup_for_uid(u32 phy_uid, u32 phy_uid_mask); +int phy_eee_rx_clock_stop(struct phy_device *phydev, bool clk_stop_enable); int phy_init_eee(struct phy_device *phydev, bool clk_stop_enable); int phy_get_eee_err(struct phy_device *phydev); int phy_ethtool_set_eee(struct phy_device *phydev, struct ethtool_keee *data); @@ -2123,6 +2174,13 @@ int phy_ethtool_get_strings(struct phy_device *phydev, u8 *data); int phy_ethtool_get_sset_count(struct phy_device *phydev); int phy_ethtool_get_stats(struct phy_device *phydev, struct ethtool_stats *stats, u64 *data); + +void __phy_ethtool_get_phy_stats(struct phy_device *phydev, + struct ethtool_eth_phy_stats *phy_stats, + struct ethtool_phy_stats *phydev_stats); +void __phy_ethtool_get_link_ext_stats(struct phy_device *phydev, + struct ethtool_link_ext_stats *link_stats); + int phy_ethtool_get_plca_cfg(struct phy_device *phydev, struct phy_plca_cfg *plca_cfg); int phy_ethtool_set_plca_cfg(struct phy_device *phydev, diff --git a/include/linux/phylib_stubs.h b/include/linux/phylib_stubs.h index 1279f48c8a70..9d2d6090c86d 100644 --- a/include/linux/phylib_stubs.h +++ b/include/linux/phylib_stubs.h @@ -5,6 +5,9 @@ #include <linux/rtnetlink.h> +struct ethtool_eth_phy_stats; +struct ethtool_link_ext_stats; +struct ethtool_phy_stats; struct kernel_hwtstamp_config; struct netlink_ext_ack; struct phy_device; @@ -19,6 +22,11 @@ struct 
phylib_stubs { int (*hwtstamp_set)(struct phy_device *phydev, struct kernel_hwtstamp_config *config, struct netlink_ext_ack *extack); + void (*get_phy_stats)(struct phy_device *phydev, + struct ethtool_eth_phy_stats *phy_stats, + struct ethtool_phy_stats *phydev_stats); + void (*get_link_ext_stats)(struct phy_device *phydev, + struct ethtool_link_ext_stats *link_stats); }; static inline int phy_hwtstamp_get(struct phy_device *phydev, @@ -50,6 +58,29 @@ static inline int phy_hwtstamp_set(struct phy_device *phydev, return phylib_stubs->hwtstamp_set(phydev, config, extack); } +static inline void phy_ethtool_get_phy_stats(struct phy_device *phydev, + struct ethtool_eth_phy_stats *phy_stats, + struct ethtool_phy_stats *phydev_stats) +{ + ASSERT_RTNL(); + + if (!phylib_stubs) + return; + + phylib_stubs->get_phy_stats(phydev, phy_stats, phydev_stats); +} + +static inline void phy_ethtool_get_link_ext_stats(struct phy_device *phydev, + struct ethtool_link_ext_stats *link_stats) +{ + ASSERT_RTNL(); + + if (!phylib_stubs) + return; + + phylib_stubs->get_link_ext_stats(phydev, link_stats); +} + #else static inline int phy_hwtstamp_get(struct phy_device *phydev, @@ -65,4 +96,15 @@ static inline int phy_hwtstamp_set(struct phy_device *phydev, return -EOPNOTSUPP; } +static inline void phy_ethtool_get_phy_stats(struct phy_device *phydev, + struct ethtool_eth_phy_stats *phy_stats, + struct ethtool_phy_stats *phydev_stats) +{ +} + +static inline void phy_ethtool_get_link_ext_stats(struct phy_device *phydev, + struct ethtool_link_ext_stats *link_stats) +{ +} + #endif diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 5462cc6a37dc..4b7a20620b49 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -393,6 +393,8 @@ struct phylink_pcs_ops; /** * struct phylink_pcs - PHYLINK PCS instance + * @supported_interfaces: describing which PHY_INTERFACE_MODE_xxx + * are supported by this PCS. 
* @ops: a pointer to the &struct phylink_pcs_ops structure * @phylink: pointer to &struct phylink_config * @neg_mode: provide PCS neg mode via "mode" argument @@ -409,6 +411,7 @@ struct phylink_pcs_ops; * the PCS driver. */ struct phylink_pcs { + DECLARE_PHY_INTERFACE_MASK(supported_interfaces); const struct phylink_pcs_ops *ops; struct phylink *phylink; bool neg_mode; diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 3b9d132cbc9e..4bc2ee0b10b0 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -102,6 +102,7 @@ void __rtnl_net_unlock(struct net *net); void rtnl_net_lock(struct net *net); void rtnl_net_unlock(struct net *net); int rtnl_net_trylock(struct net *net); +int rtnl_net_lock_killable(struct net *net); int rtnl_net_lock_cmp_fn(const struct lockdep_map *a, const struct lockdep_map *b); bool rtnl_net_is_locked(struct net *net); @@ -138,6 +139,11 @@ static inline int rtnl_net_trylock(struct net *net) return rtnl_trylock(); } +static inline int rtnl_net_lock_killable(struct net *net) +{ + return rtnl_lock_killable(); +} + static inline void ASSERT_RTNL_NET(struct net *net) { ASSERT_RTNL(); diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 58337898fa21..f8f91b2038ea 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -546,4 +546,7 @@ int inet6_fill_ifmcaddr(struct sk_buff *skb, const struct ifmcaddr6 *ifmca, struct inet6_fill_args *args); +int inet6_fill_ifacaddr(struct sk_buff *skb, + const struct ifacaddr6 *ifaca, + struct inet6_fill_args *args); #endif diff --git a/include/net/ax25.h b/include/net/ax25.h index cb622d84cd0c..4ee141aae0a2 100644 --- a/include/net/ax25.h +++ b/include/net/ax25.h @@ -231,6 +231,7 @@ typedef struct ax25_dev { #endif refcount_t refcount; bool device_up; + struct rcu_head rcu; } ax25_dev; typedef struct ax25_cb { @@ -290,9 +291,8 @@ static inline void ax25_dev_hold(ax25_dev *ax25_dev) static inline void ax25_dev_put(ax25_dev *ax25_dev) { - if 
(refcount_dec_and_test(&ax25_dev->refcount)) { - kfree(ax25_dev); - } + if (refcount_dec_and_test(&ax25_dev->refcount)) + kfree_rcu(ax25_dev, rcu); } static inline __be16 ax25_type_trans(struct sk_buff *skb, struct net_device *dev) { @@ -335,9 +335,9 @@ void ax25_digi_invert(const ax25_digi *, ax25_digi *); extern spinlock_t ax25_dev_lock; #if IS_ENABLED(CONFIG_AX25) -static inline ax25_dev *ax25_dev_ax25dev(struct net_device *dev) +static inline ax25_dev *ax25_dev_ax25dev(const struct net_device *dev) { - return dev->ax25_ptr; + return rcu_dereference_rtnl(dev->ax25_ptr); } #endif diff --git a/include/net/dsa.h b/include/net/dsa.h index 4aeedb296d67..9640d5c67f56 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -991,8 +991,6 @@ struct dsa_switch_ops { bool (*support_eee)(struct dsa_switch *ds, int port); int (*set_mac_eee)(struct dsa_switch *ds, int port, struct ethtool_keee *e); - int (*get_mac_eee)(struct dsa_switch *ds, int port, - struct ethtool_keee *e); /* EEPROM access */ int (*get_eeprom_len)(struct dsa_switch *ds); diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 3c82fad904d4..c7f42844c79a 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -282,7 +282,7 @@ static inline int inet_csk_reqsk_queue_len(const struct sock *sk) static inline int inet_csk_reqsk_queue_is_full(const struct sock *sk) { - return inet_csk_reqsk_queue_len(sk) >= READ_ONCE(sk->sk_max_ack_backlog); + return inet_csk_reqsk_queue_len(sk) > READ_ONCE(sk->sk_max_ack_backlog); } bool inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req); diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 3ccbad881d74..1086256549fa 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -19,6 +19,7 @@ #include <linux/netdevice.h> #include <net/flow.h> +#include <net/inet_dscp.h> #include <net/sock.h> #include <net/request_sock.h> #include <net/netns/hash.h> @@ -302,6 
+303,11 @@ static inline unsigned long inet_cmsg_flags(const struct inet_sock *inet) return READ_ONCE(inet->inet_flags) & IP_CMSG_ALL; } +static inline dscp_t inet_sk_dscp(const struct inet_sock *inet) +{ + return inet_dsfield_to_dscp(READ_ONCE(inet->tos)); +} + #define inet_test_bit(nr, sk) \ test_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags) #define inet_set_bit(nr, sk) \ diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 32c09e85a64c..4b0677e48190 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -38,6 +38,7 @@ #define XFRM_PROTO_COMP 108 #define XFRM_PROTO_IPIP 4 #define XFRM_PROTO_IPV6 41 +#define XFRM_PROTO_IPTFS IPPROTO_AGGFRAG #define XFRM_PROTO_ROUTING IPPROTO_ROUTING #define XFRM_PROTO_DSTOPTS IPPROTO_DSTOPTS @@ -213,6 +214,7 @@ struct xfrm_state { u16 family; xfrm_address_t saddr; int header_len; + int enc_hdr_len; int trailer_len; u32 extra_flags; struct xfrm_mark smark; @@ -303,6 +305,9 @@ struct xfrm_state { * interpreted by xfrm_type methods. */ void *data; u8 dir; + + const struct xfrm_mode_cbs *mode_cbs; + void *mode_data; }; static inline struct net *xs_net(struct xfrm_state *x) @@ -460,6 +465,45 @@ struct xfrm_type_offload { int xfrm_register_type_offload(const struct xfrm_type_offload *type, unsigned short family); void xfrm_unregister_type_offload(const struct xfrm_type_offload *type, unsigned short family); +/** + * struct xfrm_mode_cbs - XFRM mode callbacks + * @owner: module owner or NULL + * @init_state: Add/init mode specific state in `xfrm_state *x` + * @clone_state: Copy mode specific values from `orig` to new state `x` + * @destroy_state: Cleanup mode specific state from `xfrm_state *x` + * @user_init: Process mode specific netlink attributes from user + * @copy_to_user: Add netlink attributes to `attrs` based on state in `x` + * @sa_len: Return space required to store mode specific netlink attributes + * @get_inner_mtu: Return avail payload space after removing encap overhead + * @input: Process received packet from 
SA using mode + * @output: Output given packet using mode + * @prepare_output: Add mode specific encapsulation to packet in skb. On return + * `transport_header` should point at ESP header, `network_header` should + * point at outer IP header and `mac_header` should opint at the + * protocol/nexthdr field of the outer IP. + * + * One should examine and understand the specific uses of these callbacks in + * xfrm for further detail on how and when these functions are called. RTSL. + */ +struct xfrm_mode_cbs { + struct module *owner; + int (*init_state)(struct xfrm_state *x); + int (*clone_state)(struct xfrm_state *x, struct xfrm_state *orig); + void (*destroy_state)(struct xfrm_state *x); + int (*user_init)(struct net *net, struct xfrm_state *x, + struct nlattr **attrs, + struct netlink_ext_ack *extack); + int (*copy_to_user)(struct xfrm_state *x, struct sk_buff *skb); + unsigned int (*sa_len)(const struct xfrm_state *x); + u32 (*get_inner_mtu)(struct xfrm_state *x, int outer_mtu); + int (*input)(struct xfrm_state *x, struct sk_buff *skb); + int (*output)(struct net *net, struct sock *sk, struct sk_buff *skb); + int (*prepare_output)(struct xfrm_state *x, struct sk_buff *skb); +}; + +int xfrm_register_mode_cbs(u8 mode, const struct xfrm_mode_cbs *mode_cbs); +void xfrm_unregister_mode_cbs(u8 mode); + static inline int xfrm_af2proto(unsigned int family) { switch(family) { diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 7e1b3820f91f..d1089b88efc7 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -681,6 +681,7 @@ enum ethtool_link_ext_substate_module { * @ETH_SS_STATS_ETH_MAC: names of IEEE 802.3 MAC statistics * @ETH_SS_STATS_ETH_CTRL: names of IEEE 802.3 MAC Control statistics * @ETH_SS_STATS_RMON: names of RMON statistics + * @ETH_SS_STATS_PHY: names of PHY(dev) statistics * * @ETH_SS_COUNT: number of defined string sets */ @@ -706,6 +707,7 @@ enum ethtool_stringset { ETH_SS_STATS_ETH_MAC, 
ETH_SS_STATS_ETH_CTRL, ETH_SS_STATS_RMON, + ETH_SS_STATS_PHY, /* add new constants above here */ ETH_SS_COUNT diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 9c909ce733a5..9ff72cfb2e98 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -99,6 +99,7 @@ enum { ETHTOOL_STATS_ETH_MAC, ETHTOOL_STATS_ETH_CTRL, ETHTOOL_STATS_RMON, + ETHTOOL_STATS_PHY, /* add new constants above here */ __ETHTOOL_STATS_CNT @@ -193,6 +194,19 @@ enum { ETHTOOL_A_STATS_RMON_MAX = (__ETHTOOL_A_STATS_RMON_CNT - 1) }; +enum { + /* Basic packet counters if PHY has separate counters from the MAC */ + ETHTOOL_A_STATS_PHY_RX_PKTS, + ETHTOOL_A_STATS_PHY_RX_BYTES, + ETHTOOL_A_STATS_PHY_RX_ERRORS, + ETHTOOL_A_STATS_PHY_TX_PKTS, + ETHTOOL_A_STATS_PHY_TX_BYTES, + ETHTOOL_A_STATS_PHY_TX_ERRORS, + + /* add new constants above here */ + __ETHTOOL_A_STATS_PHY_CNT, + ETHTOOL_A_STATS_PHY_MAX = (__ETHTOOL_A_STATS_PHY_CNT - 1) +}; /* generic netlink info */ #define ETHTOOL_GENL_NAME "ethtool" diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 77730c340c8f..bfe880fbbb24 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -1315,6 +1315,8 @@ enum { IFLA_NETKIT_MODE, IFLA_NETKIT_SCRUB, IFLA_NETKIT_PEER_SCRUB, + IFLA_NETKIT_HEADROOM, + IFLA_NETKIT_TAILROOM, __IFLA_NETKIT_MAX, }; #define IFLA_NETKIT_MAX (__IFLA_NETKIT_MAX - 1) diff --git a/include/uapi/linux/in.h b/include/uapi/linux/in.h index 5d32d53508d9..ced0fc3c3aa5 100644 --- a/include/uapi/linux/in.h +++ b/include/uapi/linux/in.h @@ -79,6 +79,8 @@ enum { #define IPPROTO_MPLS IPPROTO_MPLS IPPROTO_ETHERNET = 143, /* Ethernet-within-IPv6 Encapsulation */ #define IPPROTO_ETHERNET IPPROTO_ETHERNET + IPPROTO_AGGFRAG = 144, /* AGGFRAG in ESP (RFC 9347) */ +#define IPPROTO_AGGFRAG IPPROTO_AGGFRAG IPPROTO_RAW = 255, /* Raw IP packets */ #define IPPROTO_RAW IPPROTO_RAW IPPROTO_SMC = 256, /* Shared Memory Communications */ diff 
--git a/include/uapi/linux/ip.h b/include/uapi/linux/ip.h index 283dec7e3645..5bd7ce934d74 100644 --- a/include/uapi/linux/ip.h +++ b/include/uapi/linux/ip.h @@ -137,6 +137,22 @@ struct ip_beet_phdr { __u8 reserved; }; +struct ip_iptfs_hdr { + __u8 subtype; /* 0*: basic, 1: CC */ + __u8 flags; + __be16 block_offset; +}; + +struct ip_iptfs_cc_hdr { + __u8 subtype; /* 0: basic, 1*: CC */ + __u8 flags; + __be16 block_offset; + __be32 loss_rate; + __be64 rtt_adelay_xdelay; + __be32 tval; + __be32 techo; +}; + /* index values for the variables in ipv4_devconf */ enum { diff --git a/include/uapi/linux/ipsec.h b/include/uapi/linux/ipsec.h index 50d8ee1791e2..696b790f4346 100644 --- a/include/uapi/linux/ipsec.h +++ b/include/uapi/linux/ipsec.h @@ -14,7 +14,8 @@ enum { IPSEC_MODE_ANY = 0, /* We do not support this for SA */ IPSEC_MODE_TRANSPORT = 1, IPSEC_MODE_TUNNEL = 2, - IPSEC_MODE_BEET = 3 + IPSEC_MODE_BEET = 3, + IPSEC_MODE_IPTFS = 4 }; enum { diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 5ee94c511a28..66c3903d29cf 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -100,7 +100,11 @@ enum { RTM_GETMULTICAST, #define RTM_GETMULTICAST RTM_GETMULTICAST - RTM_GETANYCAST = 62, + RTM_NEWANYCAST = 60, +#define RTM_NEWANYCAST RTM_NEWANYCAST + RTM_DELANYCAST, +#define RTM_DELANYCAST RTM_DELANYCAST + RTM_GETANYCAST, #define RTM_GETANYCAST RTM_GETANYCAST RTM_NEWNEIGHTBL = 64, @@ -783,6 +787,8 @@ enum rtnetlink_groups { #define RTNLGRP_IPV4_MCADDR RTNLGRP_IPV4_MCADDR RTNLGRP_IPV6_MCADDR, #define RTNLGRP_IPV6_MCADDR RTNLGRP_IPV6_MCADDR + RTNLGRP_IPV6_ACADDR, +#define RTNLGRP_IPV6_ACADDR RTNLGRP_IPV6_ACADDR __RTNLGRP_MAX }; #define RTNLGRP_MAX (__RTNLGRP_MAX - 1) diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index 51da2e00112d..2e75674e7d4f 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -339,6 +339,8 @@ enum LINUX_MIB_XFRMACQUIREERROR, /* XfrmAcquireError */ 
LINUX_MIB_XFRMOUTSTATEDIRERROR, /* XfrmOutStateDirError */ LINUX_MIB_XFRMINSTATEDIRERROR, /* XfrmInStateDirError */ + LINUX_MIB_XFRMINIPTFSERROR, /* XfrmInIptfsError */ + LINUX_MIB_XFRMOUTNOQSPACE, /* XfrmOutNoQueueSpace */ __LINUX_MIB_XFRMMAX }; diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h index d73a97e3030a..a23495c0e0a1 100644 --- a/include/uapi/linux/xfrm.h +++ b/include/uapi/linux/xfrm.h @@ -158,7 +158,8 @@ enum { #define XFRM_MODE_ROUTEOPTIMIZATION 2 #define XFRM_MODE_IN_TRIGGER 3 #define XFRM_MODE_BEET 4 -#define XFRM_MODE_MAX 5 +#define XFRM_MODE_IPTFS 5 +#define XFRM_MODE_MAX 6 /* Netlink configuration messages. */ enum { @@ -323,6 +324,12 @@ enum xfrm_attr_type_t { XFRMA_SA_DIR, /* __u8 */ XFRMA_NAT_KEEPALIVE_INTERVAL, /* __u32 in seconds for NAT keepalive */ XFRMA_SA_PCPU, /* __u32 */ + XFRMA_IPTFS_DROP_TIME, /* __u32 in: usec to wait for next seq */ + XFRMA_IPTFS_REORDER_WINDOW, /* __u16 in: reorder window size (pkts) */ + XFRMA_IPTFS_DONT_FRAG, /* out: don't use fragmentation */ + XFRMA_IPTFS_INIT_DELAY, /* __u32 out: initial packet wait delay (usec) */ + XFRMA_IPTFS_MAX_QSIZE, /* __u32 out: max ingress queue size (octets) */ + XFRMA_IPTFS_PKT_SIZE, /* __u32 out: size of outer packet, 0 for PMTU */ __XFRMA_MAX #define XFRMA_OUTPUT_MARK XFRMA_SET_MARK /* Compatibility */ diff --git a/include/ufs/ufshcd.h b/include/ufs/ufshcd.h index d650ae6b58d3..74e5b9960c54 100644 --- a/include/ufs/ufshcd.h +++ b/include/ufs/ufshcd.h @@ -329,7 +329,6 @@ struct ufs_pwr_mode_info { * @program_key: program or evict an inline encryption key * @fill_crypto_prdt: initialize crypto-related fields in the PRDT * @event_notify: called to notify important events - * @reinit_notify: called to notify reinit of UFSHCD during max gear switch * @mcq_config_resource: called to configure MCQ platform resources * @get_hba_mac: reports maximum number of outstanding commands supported by * the controller. 
Should be implemented for UFSHCI 4.0 or later @@ -381,7 +380,6 @@ struct ufs_hba_variant_ops { void *prdt, unsigned int num_segments); void (*event_notify)(struct ufs_hba *hba, enum ufs_event_type evt, void *data); - void (*reinit_notify)(struct ufs_hba *); int (*mcq_config_resource)(struct ufs_hba *hba); int (*get_hba_mac)(struct ufs_hba *hba); int (*op_runtime_config)(struct ufs_hba *hba); diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index d3403c8216db..ff691f37462c 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -320,7 +320,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) ret |= io_alloc_cache_init(&ctx->rw_cache, IO_ALLOC_CACHE_MAX, sizeof(struct io_async_rw)); ret |= io_alloc_cache_init(&ctx->uring_cache, IO_ALLOC_CACHE_MAX, - sizeof(struct uring_cache)); + sizeof(struct io_uring_cmd_data)); spin_lock_init(&ctx->msg_lock); ret |= io_alloc_cache_init(&ctx->msg_cache, IO_ALLOC_CACHE_MAX, sizeof(struct io_kiocb)); diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index d407576ddfb7..eec5eb7de843 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -139,6 +139,7 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, struct io_uring_buf_ring *br = bl->buf_ring; __u16 tail, head = bl->head; struct io_uring_buf *buf; + void __user *ret; tail = smp_load_acquire(&br->tail); if (unlikely(tail == head)) @@ -153,6 +154,7 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, req->flags |= REQ_F_BUFFER_RING | REQ_F_BUFFERS_COMMIT; req->buf_list = bl; req->buf_index = buf->bid; + ret = u64_to_user_ptr(buf->addr); if (issue_flags & IO_URING_F_UNLOCKED || !io_file_can_poll(req)) { /* @@ -168,7 +170,7 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, io_kbuf_commit(req, bl, *len, 1); req->buf_list = NULL; } - return u64_to_user_ptr(buf->addr); + return ret; } void __user *io_buffer_select(struct io_kiocb *req, size_t *len, diff --git 
a/io_uring/net.c b/io_uring/net.c index df1f7dc6f1c8..c6cd38cc5dc4 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -754,6 +754,7 @@ static int io_recvmsg_prep_setup(struct io_kiocb *req) if (req->opcode == IORING_OP_RECV) { kmsg->msg.msg_name = NULL; kmsg->msg.msg_namelen = 0; + kmsg->msg.msg_inq = 0; kmsg->msg.msg_control = NULL; kmsg->msg.msg_get_inq = 1; kmsg->msg.msg_controllen = 0; diff --git a/io_uring/opdef.c b/io_uring/opdef.c index 3de75eca1c92..e8baef4e5146 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -7,6 +7,7 @@ #include <linux/fs.h> #include <linux/file.h> #include <linux/io_uring.h> +#include <linux/io_uring/cmd.h> #include "io_uring.h" #include "opdef.h" @@ -414,7 +415,7 @@ const struct io_issue_def io_issue_defs[] = { .plug = 1, .iopoll = 1, .iopoll_queue = 1, - .async_size = 2 * sizeof(struct io_uring_sqe), + .async_size = sizeof(struct io_uring_cmd_data), .prep = io_uring_cmd_prep, .issue = io_uring_cmd, }, diff --git a/io_uring/rw.c b/io_uring/rw.c index 0bcb83e4ce3c..29bb3010f9c0 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -983,6 +983,8 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags) io_kbuf_recycle(req, issue_flags); if (ret < 0) req_set_fail(req); + } else if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { + cflags = io_put_kbuf(req, ret, issue_flags); } else { /* * Any successful return value will keep the multishot read diff --git a/io_uring/timeout.c b/io_uring/timeout.c index bbe58638eca7..362689b17ccc 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -85,7 +85,27 @@ static void io_timeout_complete(struct io_kiocb *req, struct io_tw_state *ts) io_req_task_complete(req, ts); } -static bool io_kill_timeout(struct io_kiocb *req, int status) +static __cold bool io_flush_killed_timeouts(struct list_head *list, int err) +{ + if (list_empty(list)) + return false; + + while (!list_empty(list)) { + struct io_timeout *timeout; + struct io_kiocb *req; + + timeout = list_first_entry(list, struct 
io_timeout, list); + list_del_init(&timeout->list); + req = cmd_to_io_kiocb(timeout); + if (err) + req_set_fail(req); + io_req_queue_tw_complete(req, err); + } + + return true; +} + +static void io_kill_timeout(struct io_kiocb *req, struct list_head *list) __must_hold(&req->ctx->timeout_lock) { struct io_timeout_data *io = req->async_data; @@ -93,21 +113,17 @@ static bool io_kill_timeout(struct io_kiocb *req, int status) if (hrtimer_try_to_cancel(&io->timer) != -1) { struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); - if (status) - req_set_fail(req); atomic_set(&req->ctx->cq_timeouts, atomic_read(&req->ctx->cq_timeouts) + 1); - list_del_init(&timeout->list); - io_req_queue_tw_complete(req, status); - return true; + list_move_tail(&timeout->list, list); } - return false; } __cold void io_flush_timeouts(struct io_ring_ctx *ctx) { - u32 seq; struct io_timeout *timeout, *tmp; + LIST_HEAD(list); + u32 seq; raw_spin_lock_irq(&ctx->timeout_lock); seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); @@ -131,10 +147,11 @@ __cold void io_flush_timeouts(struct io_ring_ctx *ctx) if (events_got < events_needed) break; - io_kill_timeout(req, 0); + io_kill_timeout(req, &list); } ctx->cq_last_tm_flush = seq; raw_spin_unlock_irq(&ctx->timeout_lock); + io_flush_killed_timeouts(&list, 0); } static void io_req_tw_fail_links(struct io_kiocb *link, struct io_tw_state *ts) @@ -661,7 +678,7 @@ __cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct io_uring_task *tctx bool cancel_all) { struct io_timeout *timeout, *tmp; - int canceled = 0; + LIST_HEAD(list); /* * completion_lock is needed for io_match_task(). 
Take it before @@ -672,11 +689,11 @@ __cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct io_uring_task *tctx list_for_each_entry_safe(timeout, tmp, &ctx->timeout_list, list) { struct io_kiocb *req = cmd_to_io_kiocb(timeout); - if (io_match_task(req, tctx, cancel_all) && - io_kill_timeout(req, -ECANCELED)) - canceled++; + if (io_match_task(req, tctx, cancel_all)) + io_kill_timeout(req, &list); } raw_spin_unlock_irq(&ctx->timeout_lock); spin_unlock(&ctx->completion_lock); - return canceled != 0; + + return io_flush_killed_timeouts(&list, -ECANCELED); } diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index af842e9b4eb9..ce7726a04883 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -16,26 +16,35 @@ #include "rsrc.h" #include "uring_cmd.h" -static struct uring_cache *io_uring_async_get(struct io_kiocb *req) +static struct io_uring_cmd_data *io_uring_async_get(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - struct uring_cache *cache; + struct io_uring_cmd_data *cache; cache = io_alloc_cache_get(&ctx->uring_cache); if (cache) { + cache->op_data = NULL; req->flags |= REQ_F_ASYNC_DATA; req->async_data = cache; return cache; } - if (!io_alloc_async_data(req)) - return req->async_data; + if (!io_alloc_async_data(req)) { + cache = req->async_data; + cache->op_data = NULL; + return cache; + } return NULL; } static void io_req_uring_cleanup(struct io_kiocb *req, unsigned int issue_flags) { struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); - struct uring_cache *cache = req->async_data; + struct io_uring_cmd_data *cache = req->async_data; + + if (cache->op_data) { + kfree(cache->op_data); + cache->op_data = NULL; + } if (issue_flags & IO_URING_F_UNLOCKED) return; @@ -183,7 +192,7 @@ static int io_uring_cmd_prep_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); - struct uring_cache *cache; + struct io_uring_cmd_data *cache; cache 
= io_uring_async_get(req); if (unlikely(!cache)) @@ -260,7 +269,7 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) ret = file->f_op->uring_cmd(ioucmd, issue_flags); if (ret == -EAGAIN) { - struct uring_cache *cache = req->async_data; + struct io_uring_cmd_data *cache = req->async_data; if (ioucmd->sqe != (void *) cache) memcpy(cache, ioucmd->sqe, uring_sqe_size(req->ctx)); diff --git a/io_uring/uring_cmd.h b/io_uring/uring_cmd.h index 7dba0f1efc58..f6837ee0955b 100644 --- a/io_uring/uring_cmd.h +++ b/io_uring/uring_cmd.h @@ -1,9 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -struct uring_cache { - struct io_uring_sqe sqes[2]; -}; - int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags); int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh index 383fd43ac612..7e1340da5aca 100755 --- a/kernel/gen_kheaders.sh +++ b/kernel/gen_kheaders.sh @@ -89,6 +89,7 @@ find $cpio_dir -type f -print0 | # Create archive and try to normalize metadata for reproducibility. tar "${KBUILD_BUILD_TIMESTAMP:+--mtime=$KBUILD_BUILD_TIMESTAMP}" \ + --exclude=".__afs*" --exclude=".nfs*" \ --owner=0 --group=0 --sort=name --numeric-owner --mode=u=rw,go=r,a+X \ -I $XZ -cf $tarfile -C $cpio_dir/ . > /dev/null diff --git a/kernel/kcov.c b/kernel/kcov.c index 28a6be6e64fd..187ba1b80bda 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -166,7 +166,7 @@ static void kcov_remote_area_put(struct kcov_remote_area *area, * Unlike in_serving_softirq(), this function returns false when called during * a hardirq or an NMI that happened in the softirq context. 
*/ -static inline bool in_softirq_really(void) +static __always_inline bool in_softirq_really(void) { return in_serving_softirq() && !in_hardirq() && !in_nmi(); } diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 7fff1d045477..19d2699cf638 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -4763,7 +4763,7 @@ static void scx_ops_bypass(bool bypass) * sees scx_rq_bypassing() before moving tasks to SCX. */ if (!scx_enabled()) { - rq_unlock_irqrestore(rq, &rf); + rq_unlock(rq, &rf); continue; } @@ -7013,7 +7013,7 @@ __bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, return -ENOENT; INIT_LIST_HEAD(&kit->cursor.node); - kit->cursor.flags |= SCX_DSQ_LNODE_ITER_CURSOR | flags; + kit->cursor.flags = SCX_DSQ_LNODE_ITER_CURSOR | flags; kit->cursor.priv = READ_ONCE(kit->dsq->seq); return 0; diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 8b07576814a5..f7d8fc204579 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3680,23 +3680,27 @@ void workqueue_softirq_dead(unsigned int cpu) * check_flush_dependency - check for flush dependency sanity * @target_wq: workqueue being flushed * @target_work: work item being flushed (NULL for workqueue flushes) + * @from_cancel: are we called from the work cancel path * * %current is trying to flush the whole @target_wq or @target_work on it. - * If @target_wq doesn't have %WQ_MEM_RECLAIM, verify that %current is not - * reclaiming memory or running on a workqueue which doesn't have - * %WQ_MEM_RECLAIM as that can break forward-progress guarantee leading to - * a deadlock. + * If this is not the cancel path (which implies work being flushed is either + * already running, or will not be at all), check if @target_wq doesn't have + * %WQ_MEM_RECLAIM and verify that %current is not reclaiming memory or running + * on a workqueue which doesn't have %WQ_MEM_RECLAIM as that can break forward- + * progress guarantee leading to a deadlock. 
*/ static void check_flush_dependency(struct workqueue_struct *target_wq, - struct work_struct *target_work) + struct work_struct *target_work, + bool from_cancel) { - work_func_t target_func = target_work ? target_work->func : NULL; + work_func_t target_func; struct worker *worker; - if (target_wq->flags & WQ_MEM_RECLAIM) + if (from_cancel || target_wq->flags & WQ_MEM_RECLAIM) return; worker = current_wq_worker(); + target_func = target_work ? target_work->func : NULL; WARN_ONCE(current->flags & PF_MEMALLOC, "workqueue: PF_MEMALLOC task %d(%s) is flushing !WQ_MEM_RECLAIM %s:%ps", @@ -3980,7 +3984,7 @@ void __flush_workqueue(struct workqueue_struct *wq) list_add_tail(&this_flusher.list, &wq->flusher_overflow); } - check_flush_dependency(wq, NULL); + check_flush_dependency(wq, NULL, false); mutex_unlock(&wq->mutex); @@ -4155,7 +4159,7 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, } wq = pwq->wq; - check_flush_dependency(wq, work); + check_flush_dependency(wq, work, from_cancel); insert_wq_barrier(pwq, barr, work, worker); raw_spin_unlock_irq(&pool->lock); @@ -5641,6 +5645,7 @@ static void wq_adjust_max_active(struct workqueue_struct *wq) } while (activated); } +__printf(1, 0) static struct workqueue_struct *__alloc_workqueue(const char *fmt, unsigned int flags, int max_active, va_list args) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index d0ae808f3a14..047397136f15 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -4354,6 +4354,7 @@ int mas_alloc_cyclic(struct ma_state *mas, unsigned long *startp, ret = 1; } if (ret < 0 && range_lo > min) { + mas_reset(mas); ret = mas_empty_area(mas, min, range_hi, 1); if (ret == 0) ret = 1; diff --git a/mm/damon/core.c b/mm/damon/core.c index 8b8e2933dcd4..0776452a1abb 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -868,6 +868,11 @@ static int damon_commit_schemes(struct damon_ctx *dst, struct damon_ctx *src) NUMA_NO_NODE); if (!new_scheme) return -ENOMEM; + err = 
damos_commit(new_scheme, src_scheme); + if (err) { + damon_destroy_scheme(new_scheme); + return err; + } damon_add_scheme(dst, new_scheme); } return 0; @@ -961,8 +966,11 @@ static int damon_commit_targets( return -ENOMEM; err = damon_commit_target(new_target, false, src_target, damon_target_has_pid(src)); - if (err) + if (err) { + damon_destroy_target(new_target); return err; + } + damon_add_target(dst, new_target); } return 0; } diff --git a/mm/filemap.c b/mm/filemap.c index f61cf51c2238..33b60d448fca 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -124,15 +124,6 @@ * ->private_lock (zap_pte_range->block_dirty_folio) */ -static void mapping_set_update(struct xa_state *xas, - struct address_space *mapping) -{ - if (dax_mapping(mapping) || shmem_mapping(mapping)) - return; - xas_set_update(xas, workingset_update_node); - xas_set_lru(xas, &shadow_nodes); -} - static void page_cache_delete(struct address_space *mapping, struct folio *folio, void *shadow) { diff --git a/mm/hugetlb.c b/mm/hugetlb.c index cec4b121193f..c498874a7170 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -7211,7 +7211,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, spte = hugetlb_walk(svma, saddr, vma_mmu_pagesize(svma)); if (spte) { - get_page(virt_to_page(spte)); + ptdesc_pmd_pts_inc(virt_to_ptdesc(spte)); break; } } @@ -7226,7 +7226,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, (pmd_t *)((unsigned long)spte & PAGE_MASK)); mm_inc_nr_pmds(mm); } else { - put_page(virt_to_page(spte)); + ptdesc_pmd_pts_dec(virt_to_ptdesc(spte)); } spin_unlock(&mm->page_table_lock); out: @@ -7238,10 +7238,6 @@ out: /* * unmap huge page backed by shared pte. * - * Hugetlb pte page is ref counted at the time of mapping. If pte is shared - * indicated by page_count > 1, unmap is achieved by clearing pud and - * decrementing the ref count. If count == 1, the pte page is not shared. - * * Called with page table lock held. 
* * returns: 1 successfully unmapped a shared pte page @@ -7250,18 +7246,20 @@ out: int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { + unsigned long sz = huge_page_size(hstate_vma(vma)); pgd_t *pgd = pgd_offset(mm, addr); p4d_t *p4d = p4d_offset(pgd, addr); pud_t *pud = pud_offset(p4d, addr); i_mmap_assert_write_locked(vma->vm_file->f_mapping); hugetlb_vma_assert_locked(vma); - BUG_ON(page_count(virt_to_page(ptep)) == 0); - if (page_count(virt_to_page(ptep)) == 1) + if (sz != PMD_SIZE) + return 0; + if (!ptdesc_pmd_pts_count(virt_to_ptdesc(ptep))) return 0; pud_clear(pud); - put_page(virt_to_page(ptep)); + ptdesc_pmd_pts_dec(virt_to_ptdesc(ptep)); mm_dec_nr_pmds(mm); return 1; } diff --git a/mm/internal.h b/mm/internal.h index 3bd08bafad04..9826f7dce607 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1504,6 +1504,12 @@ static inline void shrinker_debugfs_remove(struct dentry *debugfs_entry, /* Only track the nodes of mappings with shadow entries */ void workingset_update_node(struct xa_node *node); extern struct list_lru shadow_nodes; +#define mapping_set_update(xas, mapping) do { \ + if (!dax_mapping(mapping) && !shmem_mapping(mapping)) { \ + xas_set_update(xas, workingset_update_node); \ + xas_set_lru(xas, &shadow_nodes); \ + } \ +} while (0) /* mremap.c */ unsigned long move_page_tables(struct vm_area_struct *vma, diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 6f8d46d107b4..653dbb1ff05c 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -19,6 +19,7 @@ #include <linux/rcupdate_wait.h> #include <linux/swapops.h> #include <linux/shmem_fs.h> +#include <linux/dax.h> #include <linux/ksm.h> #include <asm/tlb.h> @@ -1837,6 +1838,8 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, if (result != SCAN_SUCCEED) goto out; + mapping_set_update(&xas, mapping); + __folio_set_locked(new_folio); if (is_shmem) __folio_set_swapbacked(new_folio); diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 
2a945c07ae99..737af23f4f4e 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -373,7 +373,7 @@ static void print_unreferenced(struct seq_file *seq, for (i = 0; i < nr_entries; i++) { void *ptr = (void *)entries[i]; - warn_or_seq_printf(seq, " [<%pK>] %pS\n", ptr, ptr); + warn_or_seq_printf(seq, " %pS\n", ptr); } } diff --git a/mm/list_lru.c b/mm/list_lru.c index f93ada6a207b..7d69434c70e0 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -77,7 +77,6 @@ again: spin_lock(&l->lock); nr_items = READ_ONCE(l->nr_items); if (likely(nr_items != LONG_MIN)) { - WARN_ON(nr_items < 0); rcu_read_unlock(); return l; } @@ -450,6 +449,7 @@ static void memcg_reparent_list_lru_one(struct list_lru *lru, int nid, list_splice_init(&src->list, &dst->list); if (src->nr_items) { + WARN_ON(src->nr_items < 0); dst->nr_items += src->nr_items; set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru)); } diff --git a/mm/memfd.c b/mm/memfd.c index c17c3ea701a1..35a370d75c9a 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -170,7 +170,7 @@ static int memfd_wait_for_pins(struct address_space *mapping) return error; } -static unsigned int *memfd_file_seals_ptr(struct file *file) +unsigned int *memfd_file_seals_ptr(struct file *file) { if (shmem_file(file)) return &SHMEM_I(file_inode(file))->seals; diff --git a/mm/mmap.c b/mm/mmap.c index d32b7e701058..aec208f90337 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -47,6 +47,7 @@ #include <linux/oom.h> #include <linux/sched/mm.h> #include <linux/ksm.h> +#include <linux/memfd.h> #include <linux/uaccess.h> #include <asm/cacheflush.h> @@ -368,6 +369,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr, if (file) { struct inode *inode = file_inode(file); + unsigned int seals = memfd_file_seals(file); unsigned long flags_mask; if (!file_mmap_ok(file, inode, pgoff, len)) @@ -408,6 +410,8 @@ unsigned long do_mmap(struct file *file, unsigned long addr, vm_flags |= VM_SHARED | VM_MAYSHARE; if (!(file->f_mode & FMODE_WRITE)) vm_flags &= ~(VM_MAYWRITE | VM_SHARED); 
+ else if (is_readonly_sealed(seals, vm_flags)) + vm_flags &= ~VM_MAYWRITE; fallthrough; case MAP_PRIVATE: if (!(file->f_mode & FMODE_READ)) @@ -888,7 +892,7 @@ __get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, if (get_area) { addr = get_area(file, addr, len, pgoff, flags); - } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) + } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && !file && !addr /* no hint */ && IS_ALIGNED(len, PMD_SIZE)) { /* Ensures that larger anonymous mappings are THP aligned. */ diff --git a/mm/readahead.c b/mm/readahead.c index ea650b8b02fb..e151f4b13ca4 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -646,7 +646,11 @@ void page_cache_async_ra(struct readahead_control *ractl, 1UL << order); if (index == expected) { ra->start += ra->size; - ra->size = get_next_ra_size(ra, max_pages); + /* + * In the case of MADV_HUGEPAGE, the actual size might exceed + * the readahead window. + */ + ra->size = max(ra->size, get_next_ra_size(ra, max_pages)); ra->async_size = ra->size; goto readit; } diff --git a/mm/shmem.c b/mm/shmem.c index f6fb053ac50d..ac58d4fb2e6f 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1535,7 +1535,7 @@ try_split: !shmem_falloc->waitq && index >= shmem_falloc->start && index < shmem_falloc->next) - shmem_falloc->nr_unswapped++; + shmem_falloc->nr_unswapped += nr_pages; else shmem_falloc = NULL; spin_unlock(&inode->i_lock); @@ -1689,6 +1689,7 @@ unsigned long shmem_allowable_huge_orders(struct inode *inode, unsigned long mask = READ_ONCE(huge_shmem_orders_always); unsigned long within_size_orders = READ_ONCE(huge_shmem_orders_within_size); unsigned long vm_flags = vma ? vma->vm_flags : 0; + pgoff_t aligned_index; bool global_huge; loff_t i_size; int order; @@ -1723,9 +1724,9 @@ unsigned long shmem_allowable_huge_orders(struct inode *inode, /* Allow mTHP that will be fully within i_size. 
*/ order = highest_order(within_size_orders); while (within_size_orders) { - index = round_up(index + 1, order); + aligned_index = round_up(index + 1, 1 << order); i_size = round_up(i_size_read(inode), PAGE_SIZE); - if (i_size >> PAGE_SHIFT >= index) { + if (i_size >> PAGE_SHIFT >= aligned_index) { mask |= within_size_orders; break; } diff --git a/mm/util.c b/mm/util.c index c1c3b06ab4f9..60aa40f612b8 100644 --- a/mm/util.c +++ b/mm/util.c @@ -297,12 +297,7 @@ void *memdup_user_nul(const void __user *src, size_t len) { char *p; - /* - * Always use GFP_KERNEL, since copy_from_user() can sleep and - * cause pagefault, which makes it pointless to use GFP_NOFS - * or GFP_ATOMIC. - */ - p = kmalloc_track_caller(len + 1, GFP_KERNEL); + p = kmem_buckets_alloc_track_caller(user_buckets, len + 1, GFP_USER | __GFP_NOWARN); if (!p) return ERR_PTR(-ENOMEM); diff --git a/mm/vmscan.c b/mm/vmscan.c index 76378bc257e3..9a859b7d18d7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -374,7 +374,14 @@ unsigned long zone_reclaimable_pages(struct zone *zone) if (can_reclaim_anon_pages(NULL, zone_to_nid(zone), NULL)) nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) + zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON); - + /* + * If there are no reclaimable file-backed or anonymous pages, + * ensure zones with sufficient free pages are not skipped. + * This prevents zones like DMA32 from being ignored in reclaim + * scenarios where they can still help alleviate memory pressure. 
+ */ + if (nr == 0) + nr = zone_page_state_snapshot(zone, NR_FREE_PAGES); return nr; } diff --git a/mm/zswap.c b/mm/zswap.c index f6316b66fb23..5a27af8d86ea 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -880,6 +880,18 @@ static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node) return 0; } +/* Prevent CPU hotplug from freeing up the per-CPU acomp_ctx resources */ +static struct crypto_acomp_ctx *acomp_ctx_get_cpu(struct crypto_acomp_ctx __percpu *acomp_ctx) +{ + cpus_read_lock(); + return raw_cpu_ptr(acomp_ctx); +} + +static void acomp_ctx_put_cpu(void) +{ + cpus_read_unlock(); +} + static bool zswap_compress(struct page *page, struct zswap_entry *entry, struct zswap_pool *pool) { @@ -893,8 +905,7 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry, gfp_t gfp; u8 *dst; - acomp_ctx = raw_cpu_ptr(pool->acomp_ctx); - + acomp_ctx = acomp_ctx_get_cpu(pool->acomp_ctx); mutex_lock(&acomp_ctx->mutex); dst = acomp_ctx->buffer; @@ -950,6 +961,7 @@ unlock: zswap_reject_alloc_fail++; mutex_unlock(&acomp_ctx->mutex); + acomp_ctx_put_cpu(); return comp_ret == 0 && alloc_ret == 0; } @@ -960,7 +972,7 @@ static void zswap_decompress(struct zswap_entry *entry, struct folio *folio) struct crypto_acomp_ctx *acomp_ctx; u8 *src; - acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); + acomp_ctx = acomp_ctx_get_cpu(entry->pool->acomp_ctx); mutex_lock(&acomp_ctx->mutex); src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO); @@ -990,6 +1002,7 @@ static void zswap_decompress(struct zswap_entry *entry, struct folio *folio) if (src != acomp_ctx->buffer) zpool_unmap_handle(zpool, entry->handle); + acomp_ctx_put_cpu(); } /********************************* diff --git a/net/802/psnap.c b/net/802/psnap.c index fca9d454905f..389df460c8c4 100644 --- a/net/802/psnap.c +++ b/net/802/psnap.c @@ -55,11 +55,11 @@ static int snap_rcv(struct sk_buff *skb, struct net_device *dev, goto drop; rcu_read_lock(); - proto = find_snap_client(skb_transport_header(skb)); + proto 
= find_snap_client(skb->data); if (proto) { /* Pass the frame on. */ - skb->transport_header += 5; skb_pull_rcsum(skb, 5); + skb_reset_transport_header(skb); rc = proto->rcvfunc(skb, dev, &snap_packet_type, orig_dev); } rcu_read_unlock(); diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index d6f9fae06a9d..aa6c714892ec 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -467,7 +467,7 @@ einval_put: goto out_put; } -static void ax25_fillin_cb_from_dev(ax25_cb *ax25, ax25_dev *ax25_dev) +static void ax25_fillin_cb_from_dev(ax25_cb *ax25, const ax25_dev *ax25_dev) { ax25->rtt = msecs_to_jiffies(ax25_dev->values[AX25_VALUES_T1]) / 2; ax25->t1 = msecs_to_jiffies(ax25_dev->values[AX25_VALUES_T1]); @@ -677,22 +677,22 @@ static int ax25_setsockopt(struct socket *sock, int level, int optname, break; } - rtnl_lock(); - dev = __dev_get_by_name(&init_net, devname); + rcu_read_lock(); + dev = dev_get_by_name_rcu(&init_net, devname); if (!dev) { - rtnl_unlock(); + rcu_read_unlock(); res = -ENODEV; break; } ax25->ax25_dev = ax25_dev_ax25dev(dev); if (!ax25->ax25_dev) { - rtnl_unlock(); + rcu_read_unlock(); res = -ENODEV; break; } ax25_fillin_cb(ax25, ax25->ax25_dev); - rtnl_unlock(); + rcu_read_unlock(); break; default: diff --git a/net/ax25/ax25_dev.c b/net/ax25/ax25_dev.c index 9efd6690b344..3733c0254a50 100644 --- a/net/ax25/ax25_dev.c +++ b/net/ax25/ax25_dev.c @@ -90,7 +90,7 @@ void ax25_dev_device_up(struct net_device *dev) spin_lock_bh(&ax25_dev_lock); list_add(&ax25_dev->list, &ax25_dev_list); - dev->ax25_ptr = ax25_dev; + rcu_assign_pointer(dev->ax25_ptr, ax25_dev); spin_unlock_bh(&ax25_dev_lock); ax25_register_dev_sysctl(ax25_dev); @@ -125,7 +125,7 @@ void ax25_dev_device_down(struct net_device *dev) } } - dev->ax25_ptr = NULL; + RCU_INIT_POINTER(dev->ax25_ptr, NULL); spin_unlock_bh(&ax25_dev_lock); netdev_put(dev, &ax25_dev->dev_tracker); ax25_dev_put(ax25_dev); diff --git a/net/ax25/ax25_ip.c b/net/ax25/ax25_ip.c index 36249776c021..215d4ccf12b9 100644 --- 
a/net/ax25/ax25_ip.c +++ b/net/ax25/ax25_ip.c @@ -122,6 +122,7 @@ netdev_tx_t ax25_ip_xmit(struct sk_buff *skb) if (dev == NULL) dev = skb->dev; + rcu_read_lock(); if ((ax25_dev = ax25_dev_ax25dev(dev)) == NULL) { kfree_skb(skb); goto put; @@ -202,7 +203,7 @@ netdev_tx_t ax25_ip_xmit(struct sk_buff *skb) ax25_queue_xmit(skb, dev); put: - + rcu_read_unlock(); ax25_route_lock_unuse(); return NETDEV_TX_OK; } diff --git a/net/ax25/ax25_out.c b/net/ax25/ax25_out.c index 3db76d2470e9..8bca2ace98e5 100644 --- a/net/ax25/ax25_out.c +++ b/net/ax25/ax25_out.c @@ -39,10 +39,14 @@ ax25_cb *ax25_send_frame(struct sk_buff *skb, int paclen, const ax25_address *sr * specified. */ if (paclen == 0) { - if ((ax25_dev = ax25_dev_ax25dev(dev)) == NULL) + rcu_read_lock(); + ax25_dev = ax25_dev_ax25dev(dev); + if (!ax25_dev) { + rcu_read_unlock(); return NULL; - + } paclen = ax25_dev->values[AX25_VALUES_PACLEN]; + rcu_read_unlock(); } /* @@ -53,13 +57,19 @@ ax25_cb *ax25_send_frame(struct sk_buff *skb, int paclen, const ax25_address *sr return ax25; /* It already existed */ } - if ((ax25_dev = ax25_dev_ax25dev(dev)) == NULL) + rcu_read_lock(); + ax25_dev = ax25_dev_ax25dev(dev); + if (!ax25_dev) { + rcu_read_unlock(); return NULL; + } - if ((ax25 = ax25_create_cb()) == NULL) + if ((ax25 = ax25_create_cb()) == NULL) { + rcu_read_unlock(); return NULL; - + } ax25_fillin_cb(ax25, ax25_dev); + rcu_read_unlock(); ax25->source_addr = *src; ax25->dest_addr = *dest; @@ -358,7 +368,9 @@ void ax25_queue_xmit(struct sk_buff *skb, struct net_device *dev) { unsigned char *ptr; + rcu_read_lock(); skb->protocol = ax25_type_trans(skb, ax25_fwd_dev(dev)); + rcu_read_unlock(); ptr = skb_push(skb, 1); *ptr = 0x00; /* KISS */ diff --git a/net/ax25/ax25_route.c b/net/ax25/ax25_route.c index b7c4d656a94b..69de75db0c9c 100644 --- a/net/ax25/ax25_route.c +++ b/net/ax25/ax25_route.c @@ -406,6 +406,7 @@ int ax25_rt_autobind(ax25_cb *ax25, ax25_address *addr) ax25_route_lock_unuse(); return -EHOSTUNREACH; } + 
rcu_read_lock(); if ((ax25->ax25_dev = ax25_dev_ax25dev(ax25_rt->dev)) == NULL) { err = -EHOSTUNREACH; goto put; @@ -442,6 +443,7 @@ int ax25_rt_autobind(ax25_cb *ax25, ax25_address *addr) } put: + rcu_read_unlock(); ax25_route_lock_unuse(); return err; } diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index c86f4e42e69c..7b2b04d6b856 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -1031,9 +1031,9 @@ static bool adv_use_rpa(struct hci_dev *hdev, uint32_t flags) static int hci_set_random_addr_sync(struct hci_dev *hdev, bdaddr_t *rpa) { - /* If we're advertising or initiating an LE connection we can't - * go ahead and change the random address at this time. This is - * because the eventual initiator address used for the + /* If a random_addr has been set we're advertising or initiating an LE + * connection we can't go ahead and change the random address at this + * time. This is because the eventual initiator address used for the * subsequently created connection will be undefined (some * controllers use the new address and others the one we had * when the operation started). @@ -1041,8 +1041,9 @@ static int hci_set_random_addr_sync(struct hci_dev *hdev, bdaddr_t *rpa) * In this kind of scenario skip the update and let the random * address be updated at the next cycle. 
*/ - if (hci_dev_test_flag(hdev, HCI_LE_ADV) || - hci_lookup_le_connect(hdev)) { + if (bacmp(&hdev->random_addr, BDADDR_ANY) && + (hci_dev_test_flag(hdev, HCI_LE_ADV) || + hci_lookup_le_connect(hdev))) { bt_dev_dbg(hdev, "Deferring random address update"); hci_dev_set_flag(hdev, HCI_RPA_EXPIRED); return 0; diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index b31192d473d0..de47ad999d7b 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -7655,6 +7655,24 @@ static void device_added(struct sock *sk, struct hci_dev *hdev, mgmt_event(MGMT_EV_DEVICE_ADDED, hdev, &ev, sizeof(ev), sk); } +static void add_device_complete(struct hci_dev *hdev, void *data, int err) +{ + struct mgmt_pending_cmd *cmd = data; + struct mgmt_cp_add_device *cp = cmd->param; + + if (!err) { + device_added(cmd->sk, hdev, &cp->addr.bdaddr, cp->addr.type, + cp->action); + device_flags_changed(NULL, hdev, &cp->addr.bdaddr, + cp->addr.type, hdev->conn_flags, + PTR_UINT(cmd->user_data)); + } + + mgmt_cmd_complete(cmd->sk, hdev->id, MGMT_OP_ADD_DEVICE, + mgmt_status(err), &cp->addr, sizeof(cp->addr)); + mgmt_pending_free(cmd); +} + static int add_device_sync(struct hci_dev *hdev, void *data) { return hci_update_passive_scan_sync(hdev); @@ -7663,6 +7681,7 @@ static int add_device_sync(struct hci_dev *hdev, void *data) static int add_device(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { + struct mgmt_pending_cmd *cmd; struct mgmt_cp_add_device *cp = data; u8 auto_conn, addr_type; struct hci_conn_params *params; @@ -7743,9 +7762,24 @@ static int add_device(struct sock *sk, struct hci_dev *hdev, current_flags = params->flags; } - err = hci_cmd_sync_queue(hdev, add_device_sync, NULL, NULL); - if (err < 0) + cmd = mgmt_pending_new(sk, MGMT_OP_ADD_DEVICE, hdev, data, len); + if (!cmd) { + err = -ENOMEM; goto unlock; + } + + cmd->user_data = UINT_PTR(current_flags); + + err = hci_cmd_sync_queue(hdev, add_device_sync, cmd, + add_device_complete); + if (err < 0) { + err = 
mgmt_cmd_complete(sk, hdev->id, MGMT_OP_ADD_DEVICE, + MGMT_STATUS_FAILED, &cp->addr, + sizeof(cp->addr)); + mgmt_pending_free(cmd); + } + + goto unlock; added: device_added(sk, hdev, &cp->addr.bdaddr, cp->addr.type, cp->action); diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c index af80d599c337..21a5b5535ebc 100644 --- a/net/bluetooth/rfcomm/tty.c +++ b/net/bluetooth/rfcomm/tty.c @@ -201,14 +201,14 @@ static ssize_t address_show(struct device *tty_dev, struct device_attribute *attr, char *buf) { struct rfcomm_dev *dev = dev_get_drvdata(tty_dev); - return sprintf(buf, "%pMR\n", &dev->dst); + return sysfs_emit(buf, "%pMR\n", &dev->dst); } static ssize_t channel_show(struct device *tty_dev, struct device_attribute *attr, char *buf) { struct rfcomm_dev *dev = dev_get_drvdata(tty_dev); - return sprintf(buf, "%d\n", dev->channel); + return sysfs_emit(buf, "%d\n", dev->channel); } static DEVICE_ATTR_RO(address); diff --git a/net/bridge/br_arp_nd_proxy.c b/net/bridge/br_arp_nd_proxy.c index c7869a286df4..115a23054a58 100644 --- a/net/bridge/br_arp_nd_proxy.c +++ b/net/bridge/br_arp_nd_proxy.c @@ -229,7 +229,7 @@ void br_do_proxy_suppress_arp(struct sk_buff *skb, struct net_bridge *br, #endif #if IS_ENABLED(CONFIG_IPV6) -struct nd_msg *br_is_nd_neigh_msg(struct sk_buff *skb, struct nd_msg *msg) +struct nd_msg *br_is_nd_neigh_msg(const struct sk_buff *skb, struct nd_msg *msg) { struct nd_msg *m; diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 29d6ec45cf41..1054b8a88edc 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -2299,6 +2299,6 @@ void br_do_proxy_suppress_arp(struct sk_buff *skb, struct net_bridge *br, u16 vid, struct net_bridge_port *p); void br_do_suppress_nd(struct sk_buff *skb, struct net_bridge *br, u16 vid, struct net_bridge_port *p, struct nd_msg *msg); -struct nd_msg *br_is_nd_neigh_msg(struct sk_buff *skb, struct nd_msg *m); +struct nd_msg *br_is_nd_neigh_msg(const struct sk_buff *skb, struct 
nd_msg *m); bool br_is_neigh_suppress_enabled(const struct net_bridge_port *p, u16 vid); #endif diff --git a/net/core/dev.c b/net/core/dev.c index e7223972b9aa..fda4e1039bf0 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -753,6 +753,36 @@ int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr, } EXPORT_SYMBOL_GPL(dev_fill_forward_path); +/* must be called under rcu_read_lock(), as we dont take a reference */ +static struct napi_struct *napi_by_id(unsigned int napi_id) +{ + unsigned int hash = napi_id % HASH_SIZE(napi_hash); + struct napi_struct *napi; + + hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node) + if (napi->napi_id == napi_id) + return napi; + + return NULL; +} + +/* must be called under rcu_read_lock(), as we dont take a reference */ +struct napi_struct *netdev_napi_by_id(struct net *net, unsigned int napi_id) +{ + struct napi_struct *napi; + + napi = napi_by_id(napi_id); + if (!napi) + return NULL; + + if (WARN_ON_ONCE(!napi->dev)) + return NULL; + if (!net_eq(net, dev_net(napi->dev))) + return NULL; + + return napi; +} + /** * __dev_get_by_name - find a device by its name * @net: the applicable net namespace @@ -927,7 +957,6 @@ EXPORT_SYMBOL(netdev_get_by_index); * its reference counter increased so the caller must be careful * about locking. The caller must hold RCU lock. */ - struct net_device *dev_get_by_napi_id(unsigned int napi_id) { struct napi_struct *napi; @@ -941,7 +970,6 @@ struct net_device *dev_get_by_napi_id(unsigned int napi_id) return napi ? napi->dev : NULL; } -EXPORT_SYMBOL(dev_get_by_napi_id); static DEFINE_SEQLOCK(netdev_rename_lock); @@ -1769,14 +1797,19 @@ int register_netdevice_notifier(struct notifier_block *nb) /* Close race with setup_net() and cleanup_net() */ down_write(&pernet_ops_rwsem); + + /* When RTNL is removed, we need protection for netdev_chain. 
*/ rtnl_lock(); + err = raw_notifier_chain_register(&netdev_chain, nb); if (err) goto unlock; if (dev_boot_phase) goto unlock; for_each_net(net) { + __rtnl_net_lock(net); err = call_netdevice_register_net_notifiers(nb, net); + __rtnl_net_unlock(net); if (err) goto rollback; } @@ -1787,8 +1820,11 @@ unlock: return err; rollback: - for_each_net_continue_reverse(net) + for_each_net_continue_reverse(net) { + __rtnl_net_lock(net); call_netdevice_unregister_net_notifiers(nb, net); + __rtnl_net_unlock(net); + } raw_notifier_chain_unregister(&netdev_chain, nb); goto unlock; @@ -1821,8 +1857,11 @@ int unregister_netdevice_notifier(struct notifier_block *nb) if (err) goto unlock; - for_each_net(net) + for_each_net(net) { + __rtnl_net_lock(net); call_netdevice_unregister_net_notifiers(nb, net); + __rtnl_net_unlock(net); + } unlock: rtnl_unlock(); @@ -1886,9 +1925,10 @@ int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb) { int err; - rtnl_lock(); + rtnl_net_lock(net); err = __register_netdevice_notifier_net(net, nb, false); - rtnl_unlock(); + rtnl_net_unlock(net); + return err; } EXPORT_SYMBOL(register_netdevice_notifier_net); @@ -1914,9 +1954,10 @@ int unregister_netdevice_notifier_net(struct net *net, { int err; - rtnl_lock(); + rtnl_net_lock(net); err = __unregister_netdevice_notifier_net(net, nb); - rtnl_unlock(); + rtnl_net_unlock(net); + return err; } EXPORT_SYMBOL(unregister_netdevice_notifier_net); @@ -1933,15 +1974,17 @@ int register_netdevice_notifier_dev_net(struct net_device *dev, struct notifier_block *nb, struct netdev_net_notifier *nn) { + struct net *net = dev_net(dev); int err; - rtnl_lock(); - err = __register_netdevice_notifier_net(dev_net(dev), nb, false); + rtnl_net_lock(net); + err = __register_netdevice_notifier_net(net, nb, false); if (!err) { nn->nb = nb; list_add(&nn->list, &dev->net_notifier_list); } - rtnl_unlock(); + rtnl_net_unlock(net); + return err; } EXPORT_SYMBOL(register_netdevice_notifier_dev_net); @@ -1950,12 
+1993,14 @@ int unregister_netdevice_notifier_dev_net(struct net_device *dev, struct notifier_block *nb, struct netdev_net_notifier *nn) { + struct net *net = dev_net(dev); int err; - rtnl_lock(); + rtnl_net_lock(net); list_del(&nn->list); - err = __unregister_netdevice_notifier_net(dev_net(dev), nb); - rtnl_unlock(); + err = __unregister_netdevice_notifier_net(net, nb); + rtnl_net_unlock(net); + return err; } EXPORT_SYMBOL(unregister_netdevice_notifier_dev_net); @@ -3233,7 +3278,7 @@ void netif_device_attach(struct net_device *dev) if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) && netif_running(dev)) { netif_tx_wake_all_queues(dev); - __netdev_watchdog_up(dev); + netdev_watchdog_up(dev); } } EXPORT_SYMBOL(netif_device_attach); @@ -5476,8 +5521,14 @@ static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc, orig_dev = skb->dev; skb_reset_network_header(skb); +#if !defined(CONFIG_DEBUG_NET) + /* We plan to no longer reset the transport header here. + * Give some time to fuzzers and dev build to catch bugs + * in network stacks. 
+ */ if (!skb_transport_header_was_set(skb)) skb_reset_transport_header(skb); +#endif skb_reset_mac_len(skb); pt_prev = NULL; @@ -6293,19 +6344,6 @@ bool napi_complete_done(struct napi_struct *n, int work_done) } EXPORT_SYMBOL(napi_complete_done); -/* must be called under rcu_read_lock(), as we dont take a reference */ -struct napi_struct *napi_by_id(unsigned int napi_id) -{ - unsigned int hash = napi_id % HASH_SIZE(napi_hash); - struct napi_struct *napi; - - hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node) - if (napi->napi_id == napi_id) - return napi; - - return NULL; -} - static void skb_defer_free_flush(struct softnet_data *sd) { struct sk_buff *skb, *next; @@ -6713,13 +6751,14 @@ static void napi_restore_config(struct napi_struct *n) n->gro_flush_timeout = n->config->gro_flush_timeout; n->irq_suspend_timeout = n->config->irq_suspend_timeout; /* a NAPI ID might be stored in the config, if so use it. if not, use - * napi_hash_add to generate one for us. It will be saved to the config - * in napi_disable. + * napi_hash_add to generate one for us. */ - if (n->config->napi_id) + if (n->config->napi_id) { napi_hash_add_with_id(n, n->config->napi_id); - else + } else { napi_hash_add(n); + n->config->napi_id = n->napi_id; + } } static void napi_save_config(struct napi_struct *n) @@ -6727,10 +6766,39 @@ static void napi_save_config(struct napi_struct *n) n->config->defer_hard_irqs = n->defer_hard_irqs; n->config->gro_flush_timeout = n->gro_flush_timeout; n->config->irq_suspend_timeout = n->irq_suspend_timeout; - n->config->napi_id = n->napi_id; napi_hash_del(n); } +/* Netlink wants the NAPI list to be sorted by ID, if adding a NAPI which will + * inherit an existing ID try to insert it at the right position. 
+ */ +static void +netif_napi_dev_list_add(struct net_device *dev, struct napi_struct *napi) +{ + unsigned int new_id, pos_id; + struct list_head *higher; + struct napi_struct *pos; + + new_id = UINT_MAX; + if (napi->config && napi->config->napi_id) + new_id = napi->config->napi_id; + + higher = &dev->napi_list; + list_for_each_entry(pos, &dev->napi_list, dev_list) { + if (pos->napi_id >= MIN_NAPI_ID) + pos_id = pos->napi_id; + else if (pos->config) + pos_id = pos->config->napi_id; + else + pos_id = UINT_MAX; + + if (pos_id <= new_id) + break; + higher = &pos->dev_list; + } + list_add_rcu(&napi->dev_list, higher); /* adds after higher */ +} + void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), int weight) { @@ -6757,7 +6825,7 @@ void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi, napi->list_owner = -1; set_bit(NAPI_STATE_SCHED, &napi->state); set_bit(NAPI_STATE_NPSVC, &napi->state); - list_add_rcu(&napi->dev_list, &dev->napi_list); + netif_napi_dev_list_add(dev, napi); /* default settings from sysfs are applied to all NAPIs. any per-NAPI * configuration will be loaded in napi_enable @@ -10668,26 +10736,20 @@ err_free_name: EXPORT_SYMBOL(register_netdevice); /* Initialize the core of a dummy net device. - * This is useful if you are calling this function after alloc_netdev(), - * since it does not memset the net_device fields. + * The setup steps dummy netdevs need which normal netdevs get by going + * through register_netdevice(). 
*/ -static void init_dummy_netdev_core(struct net_device *dev) +static void init_dummy_netdev(struct net_device *dev) { /* make sure we BUG if trying to hit standard * register/unregister code path */ dev->reg_state = NETREG_DUMMY; - /* NAPI wants this */ - INIT_LIST_HEAD(&dev->napi_list); - /* a dummy interface is started by default */ set_bit(__LINK_STATE_PRESENT, &dev->state); set_bit(__LINK_STATE_START, &dev->state); - /* napi_busy_loop stats accounting wants this */ - dev_net_set(dev, &init_net); - /* Note : We dont allocate pcpu_refcnt for dummy devices, * because users of this 'device' dont need to change * its refcount. @@ -10695,28 +10757,6 @@ static void init_dummy_netdev_core(struct net_device *dev) } /** - * init_dummy_netdev - init a dummy network device for NAPI - * @dev: device to init - * - * This takes a network device structure and initializes the minimum - * amount of fields so it can be used to schedule NAPI polls without - * registering a full blown interface. This is to be used by drivers - * that need to tie several hardware interfaces to a single NAPI - * poll scheduler due to HW limitations. - */ -void init_dummy_netdev(struct net_device *dev) -{ - /* Clear everything. 
Note we don't initialize spinlocks - * as they aren't supposed to be taken by any of the - * NAPI code and this dummy netdev is supposed to be - * only ever used for NAPI polls - */ - memset(dev, 0, sizeof(struct net_device)); - init_dummy_netdev_core(dev); -} -EXPORT_SYMBOL_GPL(init_dummy_netdev); - -/** * register_netdev - register a network device * @dev: device to register * @@ -10731,12 +10771,16 @@ EXPORT_SYMBOL_GPL(init_dummy_netdev); */ int register_netdev(struct net_device *dev) { + struct net *net = dev_net(dev); int err; - if (rtnl_lock_killable()) + if (rtnl_net_lock_killable(net)) return -EINTR; + err = register_netdevice(dev); - rtnl_unlock(); + + rtnl_net_unlock(net); + return err; } EXPORT_SYMBOL(register_netdev); @@ -11390,7 +11434,7 @@ EXPORT_SYMBOL(free_netdev); struct net_device *alloc_netdev_dummy(int sizeof_priv) { return alloc_netdev(sizeof_priv, "dummy#", NET_NAME_UNKNOWN, - init_dummy_netdev_core); + init_dummy_netdev); } EXPORT_SYMBOL_GPL(alloc_netdev_dummy); @@ -11606,9 +11650,11 @@ EXPORT_SYMBOL(unregister_netdevice_many); */ void unregister_netdev(struct net_device *dev) { - rtnl_lock(); + struct net *net = dev_net(dev); + + rtnl_net_lock(net); unregister_netdevice(dev); - rtnl_unlock(); + rtnl_net_unlock(net); } EXPORT_SYMBOL(unregister_netdev); diff --git a/net/core/dev.h b/net/core/dev.h index aa91eed55a40..d8966847794c 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -22,6 +22,9 @@ struct sd_flow_limit { extern int netdev_flow_limit_table_len; +struct napi_struct *netdev_napi_by_id(struct net *net, unsigned int napi_id); +struct net_device *dev_get_by_napi_id(unsigned int napi_id); + #ifdef CONFIG_PROC_FS int __init dev_proc_init(void); #else @@ -269,7 +272,6 @@ void xdp_do_check_flushed(struct napi_struct *napi); static inline void xdp_do_check_flushed(struct napi_struct *napi) { } #endif -struct napi_struct *napi_by_id(unsigned int napi_id); void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu); #define 
XMIT_RECURSION_LIMIT 8 diff --git a/net/core/link_watch.c b/net/core/link_watch.c index 1b4d39e38084..cb04ef2b9807 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -42,14 +42,18 @@ static unsigned int default_operstate(const struct net_device *dev) * first check whether lower is indeed the source of its down state. */ if (!netif_carrier_ok(dev)) { - int iflink = dev_get_iflink(dev); struct net_device *peer; + int iflink; /* If called from netdev_run_todo()/linkwatch_sync_dev(), * dev_net(dev) can be already freed, and RTNL is not held. */ - if (dev->reg_state == NETREG_UNREGISTERED || - iflink == dev->ifindex) + if (dev->reg_state <= NETREG_REGISTERED) + iflink = dev_get_iflink(dev); + else + iflink = dev->ifindex; + + if (iflink == dev->ifindex) return IF_OPER_DOWN; ASSERT_RTNL(); diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index b0772d135efb..c59619a2ec23 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -167,8 +167,6 @@ netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi, void *hdr; pid_t pid; - if (WARN_ON_ONCE(!napi->dev)) - return -EINVAL; if (!(napi->dev->flags & IFF_UP)) return 0; @@ -176,8 +174,7 @@ netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi, if (!hdr) return -EMSGSIZE; - if (napi->napi_id >= MIN_NAPI_ID && - nla_put_u32(rsp, NETDEV_A_NAPI_ID, napi->napi_id)) + if (nla_put_u32(rsp, NETDEV_A_NAPI_ID, napi->napi_id)) goto nla_put_failure; if (nla_put_u32(rsp, NETDEV_A_NAPI_IFINDEX, napi->dev->ifindex)) @@ -235,7 +232,7 @@ int netdev_nl_napi_get_doit(struct sk_buff *skb, struct genl_info *info) rtnl_lock(); rcu_read_lock(); - napi = napi_by_id(napi_id); + napi = netdev_napi_by_id(genl_info_net(info), napi_id); if (napi) { err = netdev_nl_napi_fill_one(rsp, napi, info); } else { @@ -266,12 +263,21 @@ netdev_nl_napi_dump_one(struct net_device *netdev, struct sk_buff *rsp, struct netdev_nl_dump_ctx *ctx) { struct napi_struct *napi; + unsigned int prev_id; int err = 
0; if (!(netdev->flags & IFF_UP)) return err; + prev_id = UINT_MAX; list_for_each_entry(napi, &netdev->napi_list, dev_list) { + if (napi->napi_id < MIN_NAPI_ID) + continue; + + /* Dump continuation below depends on the list being sorted */ + WARN_ON_ONCE(napi->napi_id >= prev_id); + prev_id = napi->napi_id; + if (ctx->napi_id && napi->napi_id >= ctx->napi_id) continue; @@ -354,7 +360,7 @@ int netdev_nl_napi_set_doit(struct sk_buff *skb, struct genl_info *info) rtnl_lock(); rcu_read_lock(); - napi = napi_by_id(napi_id); + napi = netdev_napi_by_id(genl_info_net(info), napi_id); if (napi) { err = netdev_nl_napi_set_config(napi, info); } else { diff --git a/net/core/netdev_rx_queue.c b/net/core/netdev_rx_queue.c index e217a5838c87..db82786fa0c4 100644 --- a/net/core/netdev_rx_queue.c +++ b/net/core/netdev_rx_queue.c @@ -79,3 +79,4 @@ err_free_new_mem: return err; } +EXPORT_SYMBOL_NS_GPL(netdev_rx_queue_restart, "NETDEV_INTERNAL"); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 6b745096809d..1f4d4b5570ab 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -84,7 +84,6 @@ int rtnl_lock_killable(void) { return mutex_lock_killable(&rtnl_mutex); } -EXPORT_SYMBOL(rtnl_lock_killable); static struct sk_buff *defer_kfree_skb_list; void rtnl_kfree_skbs(struct sk_buff *head, struct sk_buff *tail) @@ -221,6 +220,16 @@ int rtnl_net_trylock(struct net *net) } EXPORT_SYMBOL(rtnl_net_trylock); +int rtnl_net_lock_killable(struct net *net) +{ + int ret = rtnl_lock_killable(); + + if (!ret) + __rtnl_net_lock(net); + + return ret; +} + static int rtnl_net_cmp_locks(const struct net *net_a, const struct net *net_b) { if (net_eq(net_a, net_b)) diff --git a/net/dsa/user.c b/net/dsa/user.c index 4a8de48a6f24..c74f2b2b92de 100644 --- a/net/dsa/user.c +++ b/net/dsa/user.c @@ -1251,7 +1251,6 @@ static int dsa_user_get_eee(struct net_device *dev, struct ethtool_keee *e) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; - int ret; /* Check 
whether the switch supports EEE */ if (!ds->ops->support_eee || !ds->ops->support_eee(ds, dp->index)) @@ -1261,13 +1260,6 @@ static int dsa_user_get_eee(struct net_device *dev, struct ethtool_keee *e) if (!dev->phydev) return -ENODEV; - if (!ds->ops->get_mac_eee) - return -EOPNOTSUPP; - - ret = ds->ops->get_mac_eee(ds, dp->index, e); - if (ret) - return ret; - return phylink_ethtool_get_eee(dp->pl, e); } diff --git a/net/ethtool/common.c b/net/ethtool/common.c index 2607aea1fbfb..2bd77c94f9f1 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -869,6 +869,7 @@ int __ethtool_get_ts_info(struct net_device *dev, struct kernel_ethtool_ts_info *info) { struct hwtstamp_provider *hwprov; + int err = 0; rcu_read_lock(); hwprov = rcu_dereference(dev->hwprov); @@ -876,7 +877,6 @@ int __ethtool_get_ts_info(struct net_device *dev, if (!hwprov) { const struct ethtool_ops *ops = dev->ethtool_ops; struct phy_device *phydev = dev->phydev; - int err = 0; ethtool_init_tsinfo(info); if (phy_is_default_hwtstamp(phydev) && @@ -892,8 +892,9 @@ int __ethtool_get_ts_info(struct net_device *dev, return err; } + err = ethtool_get_ts_info_by_phc(dev, info, &hwprov->desc); rcu_read_unlock(); - return ethtool_get_ts_info_by_phc(dev, info, &hwprov->desc); + return err; } bool net_support_hwtstamp_qualifier(struct net_device *dev, diff --git a/net/ethtool/linkstate.c b/net/ethtool/linkstate.c index 34d76e87847d..af19e1bed303 100644 --- a/net/ethtool/linkstate.c +++ b/net/ethtool/linkstate.c @@ -3,6 +3,7 @@ #include "netlink.h" #include "common.h" #include <linux/phy.h> +#include <linux/phylib_stubs.h> struct linkstate_req_info { struct ethnl_req_info base; @@ -26,9 +27,8 @@ const struct nla_policy ethnl_linkstate_get_policy[] = { NLA_POLICY_NESTED(ethnl_header_policy_stats), }; -static int linkstate_get_sqi(struct net_device *dev) +static int linkstate_get_sqi(struct phy_device *phydev) { - struct phy_device *phydev = dev->phydev; int ret; if (!phydev) @@ -46,9 +46,8 @@ static int 
linkstate_get_sqi(struct net_device *dev) return ret; } -static int linkstate_get_sqi_max(struct net_device *dev) +static int linkstate_get_sqi_max(struct phy_device *phydev) { - struct phy_device *phydev = dev->phydev; int ret; if (!phydev) @@ -100,19 +99,28 @@ static int linkstate_prepare_data(const struct ethnl_req_info *req_base, { struct linkstate_reply_data *data = LINKSTATE_REPDATA(reply_base); struct net_device *dev = reply_base->dev; + struct nlattr **tb = info->attrs; + struct phy_device *phydev; int ret; + phydev = ethnl_req_get_phydev(req_base, tb[ETHTOOL_A_LINKSTATE_HEADER], + info->extack); + if (IS_ERR(phydev)) { + ret = PTR_ERR(phydev); + goto out; + } + ret = ethnl_ops_begin(dev); if (ret < 0) return ret; data->link = __ethtool_get_link(dev); - ret = linkstate_get_sqi(dev); + ret = linkstate_get_sqi(phydev); if (linkstate_sqi_critical_error(ret)) goto out; data->sqi = ret; - ret = linkstate_get_sqi_max(dev); + ret = linkstate_get_sqi_max(phydev); if (linkstate_sqi_critical_error(ret)) goto out; data->sqi_max = ret; @@ -127,9 +135,9 @@ static int linkstate_prepare_data(const struct ethnl_req_info *req_base, sizeof(data->link_stats) / 8); if (req_base->flags & ETHTOOL_FLAG_STATS) { - if (dev->phydev) - data->link_stats.link_down_events = - READ_ONCE(dev->phydev->link_down_events); + if (phydev) + phy_ethtool_get_link_ext_stats(phydev, + &data->link_stats); if (dev->ethtool_ops->get_link_ext_stats) dev->ethtool_ops->get_link_ext_stats(dev, diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index 0a09298fff92..1ce0a3de1430 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -511,5 +511,6 @@ extern const char stats_eth_phy_names[__ETHTOOL_A_STATS_ETH_PHY_CNT][ETH_GSTRING extern const char stats_eth_mac_names[__ETHTOOL_A_STATS_ETH_MAC_CNT][ETH_GSTRING_LEN]; extern const char stats_eth_ctrl_names[__ETHTOOL_A_STATS_ETH_CTRL_CNT][ETH_GSTRING_LEN]; extern const char stats_rmon_names[__ETHTOOL_A_STATS_RMON_CNT][ETH_GSTRING_LEN]; +extern 
const char stats_phy_names[__ETHTOOL_A_STATS_PHY_CNT][ETH_GSTRING_LEN]; #endif /* _NET_ETHTOOL_NETLINK_H */ diff --git a/net/ethtool/stats.c b/net/ethtool/stats.c index 912f0c4fff2f..038a2558f052 100644 --- a/net/ethtool/stats.c +++ b/net/ethtool/stats.c @@ -1,5 +1,8 @@ // SPDX-License-Identifier: GPL-2.0-only +#include <linux/phy.h> +#include <linux/phylib_stubs.h> + #include "netlink.h" #include "common.h" #include "bitset.h" @@ -20,6 +23,7 @@ struct stats_reply_data { struct ethtool_eth_mac_stats mac_stats; struct ethtool_eth_ctrl_stats ctrl_stats; struct ethtool_rmon_stats rmon_stats; + struct ethtool_phy_stats phydev_stats; ); const struct ethtool_rmon_hist_range *rmon_ranges; }; @@ -32,6 +36,7 @@ const char stats_std_names[__ETHTOOL_STATS_CNT][ETH_GSTRING_LEN] = { [ETHTOOL_STATS_ETH_MAC] = "eth-mac", [ETHTOOL_STATS_ETH_CTRL] = "eth-ctrl", [ETHTOOL_STATS_RMON] = "rmon", + [ETHTOOL_STATS_PHY] = "phydev", }; const char stats_eth_phy_names[__ETHTOOL_A_STATS_ETH_PHY_CNT][ETH_GSTRING_LEN] = { @@ -76,6 +81,15 @@ const char stats_rmon_names[__ETHTOOL_A_STATS_RMON_CNT][ETH_GSTRING_LEN] = { [ETHTOOL_A_STATS_RMON_JABBER] = "etherStatsJabbers", }; +const char stats_phy_names[__ETHTOOL_A_STATS_PHY_CNT][ETH_GSTRING_LEN] = { + [ETHTOOL_A_STATS_PHY_RX_PKTS] = "RxFrames", + [ETHTOOL_A_STATS_PHY_RX_BYTES] = "RxOctets", + [ETHTOOL_A_STATS_PHY_RX_ERRORS] = "RxErrors", + [ETHTOOL_A_STATS_PHY_TX_PKTS] = "TxFrames", + [ETHTOOL_A_STATS_PHY_TX_BYTES] = "TxOctets", + [ETHTOOL_A_STATS_PHY_TX_ERRORS] = "TxErrors", +}; + const struct nla_policy ethnl_stats_get_policy[ETHTOOL_A_STATS_SRC + 1] = { [ETHTOOL_A_STATS_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), @@ -120,8 +134,15 @@ static int stats_prepare_data(const struct ethnl_req_info *req_base, struct stats_reply_data *data = STATS_REPDATA(reply_base); enum ethtool_mac_stats_src src = req_info->src; struct net_device *dev = reply_base->dev; + struct nlattr **tb = info->attrs; + struct phy_device *phydev; int ret; + phydev = 
ethnl_req_get_phydev(req_base, tb[ETHTOOL_A_STATS_HEADER], + info->extack); + if (IS_ERR(phydev)) + return PTR_ERR(phydev); + ret = ethnl_ops_begin(dev); if (ret < 0) return ret; @@ -145,6 +166,14 @@ static int stats_prepare_data(const struct ethnl_req_info *req_base, data->ctrl_stats.src = src; data->rmon_stats.src = src; + if ((test_bit(ETHTOOL_STATS_PHY, req_info->stat_mask) || + test_bit(ETHTOOL_STATS_ETH_PHY, req_info->stat_mask)) && + src == ETHTOOL_MAC_STATS_SRC_AGGREGATE) { + if (phydev) + phy_ethtool_get_phy_stats(phydev, &data->phy_stats, + &data->phydev_stats); + } + if (test_bit(ETHTOOL_STATS_ETH_PHY, req_info->stat_mask) && dev->ethtool_ops->get_eth_phy_stats) dev->ethtool_ops->get_eth_phy_stats(dev, &data->phy_stats); @@ -194,6 +223,10 @@ static int stats_reply_size(const struct ethnl_req_info *req_base, nla_total_size(4)) * /* _A_STATS_GRP_HIST_BKT_HI */ ETHTOOL_RMON_HIST_MAX * 2; } + if (test_bit(ETHTOOL_STATS_PHY, req_info->stat_mask)) { + n_stats += sizeof(struct ethtool_phy_stats) / sizeof(u64); + n_grps++; + } len += n_grps * (nla_total_size(0) + /* _A_STATS_GRP */ nla_total_size(4) + /* _A_STATS_GRP_ID */ @@ -247,6 +280,25 @@ static int stats_put_phy_stats(struct sk_buff *skb, return 0; } +static int stats_put_phydev_stats(struct sk_buff *skb, + const struct stats_reply_data *data) +{ + if (stat_put(skb, ETHTOOL_A_STATS_PHY_RX_PKTS, + data->phydev_stats.rx_packets) || + stat_put(skb, ETHTOOL_A_STATS_PHY_RX_BYTES, + data->phydev_stats.rx_bytes) || + stat_put(skb, ETHTOOL_A_STATS_PHY_RX_ERRORS, + data->phydev_stats.rx_errors) || + stat_put(skb, ETHTOOL_A_STATS_PHY_TX_PKTS, + data->phydev_stats.tx_packets) || + stat_put(skb, ETHTOOL_A_STATS_PHY_TX_BYTES, + data->phydev_stats.tx_bytes) || + stat_put(skb, ETHTOOL_A_STATS_PHY_TX_ERRORS, + data->phydev_stats.tx_errors)) + return -EMSGSIZE; + return 0; +} + static int stats_put_mac_stats(struct sk_buff *skb, const struct stats_reply_data *data) { @@ -423,6 +475,9 @@ static int stats_fill_reply(struct 
sk_buff *skb, if (!ret && test_bit(ETHTOOL_STATS_RMON, req_info->stat_mask)) ret = stats_put_stats(skb, data, ETHTOOL_STATS_RMON, ETH_SS_STATS_RMON, stats_put_rmon_stats); + if (!ret && test_bit(ETHTOOL_STATS_PHY, req_info->stat_mask)) + ret = stats_put_stats(skb, data, ETHTOOL_STATS_PHY, + ETH_SS_STATS_PHY, stats_put_phydev_stats); return ret; } diff --git a/net/ethtool/strset.c b/net/ethtool/strset.c index b3382b3cf325..818cf01f0911 100644 --- a/net/ethtool/strset.c +++ b/net/ethtool/strset.c @@ -105,6 +105,11 @@ static const struct strset_info info_template[] = { .count = __ETHTOOL_A_STATS_RMON_CNT, .strings = stats_rmon_names, }, + [ETH_SS_STATS_PHY] = { + .per_dev = false, + .count = __ETHTOOL_A_STATS_PHY_CNT, + .strings = stats_phy_names, + }, }; struct strset_req_info { diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h index fcfeb79bb040..7d7551e6f0b0 100644 --- a/net/hsr/hsr_main.h +++ b/net/hsr/hsr_main.h @@ -163,6 +163,7 @@ struct hsr_port { struct net_device *dev; struct hsr_priv *hsr; enum hsr_port_type type; + struct rcu_head rcu; }; struct hsr_frame_info; diff --git a/net/hsr/hsr_slave.c b/net/hsr/hsr_slave.c index 464f683e016d..2a802a5de2ac 100644 --- a/net/hsr/hsr_slave.c +++ b/net/hsr/hsr_slave.c @@ -204,7 +204,6 @@ int hsr_add_port(struct hsr_priv *hsr, struct net_device *dev, } list_add_tail_rcu(&port->port_list, &hsr->ports); - synchronize_rcu(); master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); netdev_update_features(master->dev); @@ -235,7 +234,5 @@ void hsr_del_port(struct hsr_port *port) netdev_upper_dev_unlink(port->dev, master->dev); } - synchronize_rcu(); - - kfree(port); + kfree_rcu(port, rcu); } diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index f3281312eb5e..b0fbf804bbba 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -816,7 +816,8 @@ int esp_input_done2(struct sk_buff *skb, int err) } skb_pull_rcsum(skb, hlen); - if (x->props.mode == XFRM_MODE_TUNNEL) + if (x->props.mode == XFRM_MODE_TUNNEL || + x->props.mode == 
XFRM_MODE_IPTFS) skb_reset_transport_header(skb); else skb_set_transport_header(skb, -ihl); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index e45222d5fc2e..cc2b5194a18d 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -898,7 +898,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb, sock_net_set(ctl_sk, net); if (sk) { ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? - inet_twsk(sk)->tw_mark : sk->sk_mark; + inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark); ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority); transmit_time = tcp_transmit_time(sk); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 4da409bc4577..c3729382be3b 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -5240,9 +5240,9 @@ int inet6_fill_ifmcaddr(struct sk_buff *skb, return 0; } -static int inet6_fill_ifacaddr(struct sk_buff *skb, - const struct ifacaddr6 *ifaca, - struct inet6_fill_args *args) +int inet6_fill_ifacaddr(struct sk_buff *skb, + const struct ifacaddr6 *ifaca, + struct inet6_fill_args *args) { struct net_device *dev = fib6_info_nh_dev(ifaca->aca_rt); int ifindex = dev ? 
dev->ifindex : 1; diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index 562cace50ca9..21e01695b48c 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -278,6 +278,37 @@ static struct ifacaddr6 *aca_alloc(struct fib6_info *f6i, return aca; } +static void inet6_ifacaddr_notify(struct net_device *dev, + const struct ifacaddr6 *ifaca, int event) +{ + struct inet6_fill_args fillargs = { + .event = event, + .netnsid = -1, + }; + struct net *net = dev_net(dev); + struct sk_buff *skb; + int err = -ENOMEM; + + skb = nlmsg_new(NLMSG_ALIGN(sizeof(struct ifaddrmsg)) + + nla_total_size(sizeof(struct in6_addr)) + + nla_total_size(sizeof(struct ifa_cacheinfo)), + GFP_KERNEL); + if (!skb) + goto error; + + err = inet6_fill_ifacaddr(skb, ifaca, &fillargs); + if (err < 0) { + pr_err("Failed to fill in anycast addresses (err %d)\n", err); + nlmsg_free(skb); + goto error; + } + + rtnl_notify(skb, net, 0, RTNLGRP_IPV6_ACADDR, NULL, GFP_KERNEL); + return; +error: + rtnl_set_sk_err(net, RTNLGRP_IPV6_ACADDR, err); +} + /* * device anycast group inc (add if not found) */ @@ -333,6 +364,8 @@ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr) addrconf_join_solict(idev->dev, &aca->aca_addr); + inet6_ifacaddr_notify(idev->dev, aca, RTM_NEWANYCAST); + aca_put(aca); return 0; out: @@ -375,6 +408,8 @@ int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr) ip6_del_rt(dev_net(idev->dev), aca->aca_rt, false); + inet6_ifacaddr_notify(idev->dev, aca, RTM_DELANYCAST); + aca_put(aca); return 0; } diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index b2400c226a32..5f3d0cc1555a 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -859,7 +859,8 @@ int esp6_input_done2(struct sk_buff *skb, int err) skb_postpull_rcsum(skb, skb_network_header(skb), skb_network_header_len(skb)); skb_pull_rcsum(skb, hlen); - if (x->props.mode == XFRM_MODE_TUNNEL) + if (x->props.mode == XFRM_MODE_TUNNEL || + x->props.mode == XFRM_MODE_IPTFS) 
skb_reset_transport_header(skb); else skb_set_transport_header(skb, -hdr_len); diff --git a/net/mac802154/iface.c b/net/mac802154/iface.c index c0e2da5072be..9e4631fade90 100644 --- a/net/mac802154/iface.c +++ b/net/mac802154/iface.c @@ -684,6 +684,10 @@ void ieee802154_if_remove(struct ieee802154_sub_if_data *sdata) ASSERT_RTNL(); mutex_lock(&sdata->local->iflist_mtx); + if (list_empty(&sdata->local->interfaces)) { + mutex_unlock(&sdata->local->iflist_mtx); + return; + } list_del_rcu(&sdata->list); mutex_unlock(&sdata->local->iflist_mtx); diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c index 38d8121331d4..b0dd008e2114 100644 --- a/net/mptcp/ctrl.c +++ b/net/mptcp/ctrl.c @@ -102,16 +102,15 @@ static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet) } #ifdef CONFIG_SYSCTL -static int mptcp_set_scheduler(const struct net *net, const char *name) +static int mptcp_set_scheduler(char *scheduler, const char *name) { - struct mptcp_pernet *pernet = mptcp_get_pernet(net); struct mptcp_sched_ops *sched; int ret = 0; rcu_read_lock(); sched = mptcp_sched_find(name); if (sched) - strscpy(pernet->scheduler, name, MPTCP_SCHED_NAME_MAX); + strscpy(scheduler, name, MPTCP_SCHED_NAME_MAX); else ret = -ENOENT; rcu_read_unlock(); @@ -122,7 +121,7 @@ static int mptcp_set_scheduler(const struct net *net, const char *name) static int proc_scheduler(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { - const struct net *net = current->nsproxy->net_ns; + char (*scheduler)[MPTCP_SCHED_NAME_MAX] = ctl->data; char val[MPTCP_SCHED_NAME_MAX]; struct ctl_table tbl = { .data = val, @@ -130,11 +129,11 @@ static int proc_scheduler(const struct ctl_table *ctl, int write, }; int ret; - strscpy(val, mptcp_get_scheduler(net), MPTCP_SCHED_NAME_MAX); + strscpy(val, *scheduler, MPTCP_SCHED_NAME_MAX); ret = proc_dostring(&tbl, write, buffer, lenp, ppos); if (write && ret == 0) - ret = mptcp_set_scheduler(net, val); + ret = mptcp_set_scheduler(*scheduler, val); 
return ret; } @@ -161,7 +160,9 @@ static int proc_blackhole_detect_timeout(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { - struct mptcp_pernet *pernet = mptcp_get_pernet(current->nsproxy->net_ns); + struct mptcp_pernet *pernet = container_of(table->data, + struct mptcp_pernet, + blackhole_timeout); int ret; ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); @@ -228,7 +229,7 @@ static struct ctl_table mptcp_sysctl_table[] = { { .procname = "available_schedulers", .maxlen = MPTCP_SCHED_BUF_MAX, - .mode = 0644, + .mode = 0444, .proc_handler = proc_available_schedulers, }, { diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 1b2e7cbb577f..c44c89ecaca6 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -47,7 +47,7 @@ static void __mptcp_destroy_sock(struct sock *sk); static void mptcp_check_send_data_fin(struct sock *sk); DEFINE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions); -static struct net_device mptcp_napi_dev; +static struct net_device *mptcp_napi_dev; /* Returns end sequence number of the receiver's advertised window */ static u64 mptcp_wnd_end(const struct mptcp_sock *msk) @@ -4147,11 +4147,13 @@ void __init mptcp_proto_init(void) if (percpu_counter_init(&mptcp_sockets_allocated, 0, GFP_KERNEL)) panic("Failed to allocate MPTCP pcpu counter\n"); - init_dummy_netdev(&mptcp_napi_dev); + mptcp_napi_dev = alloc_netdev_dummy(0); + if (!mptcp_napi_dev) + panic("Failed to allocate MPTCP dummy netdev\n"); for_each_possible_cpu(cpu) { delegated = per_cpu_ptr(&mptcp_delegated_actions, cpu); INIT_LIST_HEAD(&delegated->head); - netif_napi_add_tx(&mptcp_napi_dev, &delegated->napi, + netif_napi_add_tx(mptcp_napi_dev, &delegated->napi, mptcp_napi_poll); napi_enable(&delegated->napi); } diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 9db3e2b0b1c3..456446d7af20 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c 
@@ -2517,12 +2517,15 @@ void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls) struct hlist_nulls_head *hash; unsigned int nr_slots, i; - if (*sizep > (UINT_MAX / sizeof(struct hlist_nulls_head))) + if (*sizep > (INT_MAX / sizeof(struct hlist_nulls_head))) return NULL; BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head)); nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head)); + if (nr_slots > (INT_MAX / sizeof(struct hlist_nulls_head))) + return NULL; + hash = kvcalloc(nr_slots, sizeof(struct hlist_nulls_head), GFP_KERNEL); if (hash && nulls) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index f7ca7165e66e..83f3face8bb3 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -8821,6 +8821,7 @@ static void nft_unregister_flowtable_hook(struct net *net, } static void __nft_unregister_flowtable_net_hooks(struct net *net, + struct nft_flowtable *flowtable, struct list_head *hook_list, bool release_netdev) { @@ -8828,6 +8829,8 @@ static void __nft_unregister_flowtable_net_hooks(struct net *net, list_for_each_entry_safe(hook, next, hook_list, list) { nf_unregister_net_hook(net, &hook->ops); + flowtable->data.type->setup(&flowtable->data, hook->ops.dev, + FLOW_BLOCK_UNBIND); if (release_netdev) { list_del(&hook->list); kfree_rcu(hook, rcu); @@ -8836,9 +8839,10 @@ static void __nft_unregister_flowtable_net_hooks(struct net *net, } static void nft_unregister_flowtable_net_hooks(struct net *net, + struct nft_flowtable *flowtable, struct list_head *hook_list) { - __nft_unregister_flowtable_net_hooks(net, hook_list, false); + __nft_unregister_flowtable_net_hooks(net, flowtable, hook_list, false); } static int nft_register_flowtable_net_hooks(struct net *net, @@ -9480,8 +9484,6 @@ static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable) flowtable->data.type->free(&flowtable->data); list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) { - 
flowtable->data.type->setup(&flowtable->data, hook->ops.dev, - FLOW_BLOCK_UNBIND); list_del_rcu(&hook->list); kfree_rcu(hook, rcu); } @@ -10869,6 +10871,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) &nft_trans_flowtable_hooks(trans), trans->msg_type); nft_unregister_flowtable_net_hooks(net, + nft_trans_flowtable(trans), &nft_trans_flowtable_hooks(trans)); } else { list_del_rcu(&nft_trans_flowtable(trans)->list); @@ -10877,6 +10880,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) NULL, trans->msg_type); nft_unregister_flowtable_net_hooks(net, + nft_trans_flowtable(trans), &nft_trans_flowtable(trans)->hook_list); } break; @@ -11139,11 +11143,13 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) case NFT_MSG_NEWFLOWTABLE: if (nft_trans_flowtable_update(trans)) { nft_unregister_flowtable_net_hooks(net, + nft_trans_flowtable(trans), &nft_trans_flowtable_hooks(trans)); } else { nft_use_dec_restore(&table->use); list_del_rcu(&nft_trans_flowtable(trans)->list); nft_unregister_flowtable_net_hooks(net, + nft_trans_flowtable(trans), &nft_trans_flowtable(trans)->hook_list); } break; @@ -11736,7 +11742,8 @@ static void __nft_release_hook(struct net *net, struct nft_table *table) list_for_each_entry(chain, &table->chains, list) __nf_tables_unregister_hook(net, table, chain, true); list_for_each_entry(flowtable, &table->flowtables, list) - __nft_unregister_flowtable_net_hooks(net, &flowtable->hook_list, + __nft_unregister_flowtable_net_hooks(net, flowtable, + &flowtable->hook_list, true); } diff --git a/net/netfilter/nft_xfrm.c b/net/netfilter/nft_xfrm.c index 8a07b46cc8fb..3210cfc966ab 100644 --- a/net/netfilter/nft_xfrm.c +++ b/net/netfilter/nft_xfrm.c @@ -112,7 +112,8 @@ static bool xfrm_state_addr_ok(enum nft_xfrm_keys k, u8 family, u8 mode) return true; } - return mode == XFRM_MODE_BEET || mode == XFRM_MODE_TUNNEL; + return mode == XFRM_MODE_BEET || mode == XFRM_MODE_TUNNEL || + mode == 
XFRM_MODE_IPTFS; } static void nft_xfrm_state_get_key(const struct nft_xfrm *priv, diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 351ac1747224..0581c53e6517 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -61,8 +61,10 @@ static atomic_t rds_tcp_unloading = ATOMIC_INIT(0); static struct kmem_cache *rds_tcp_conn_slab; -static int rds_tcp_skbuf_handler(const struct ctl_table *ctl, int write, - void *buffer, size_t *lenp, loff_t *fpos); +static int rds_tcp_sndbuf_handler(const struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *fpos); +static int rds_tcp_rcvbuf_handler(const struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *fpos); static int rds_tcp_min_sndbuf = SOCK_MIN_SNDBUF; static int rds_tcp_min_rcvbuf = SOCK_MIN_RCVBUF; @@ -74,7 +76,7 @@ static struct ctl_table rds_tcp_sysctl_table[] = { /* data is per-net pointer */ .maxlen = sizeof(int), .mode = 0644, - .proc_handler = rds_tcp_skbuf_handler, + .proc_handler = rds_tcp_sndbuf_handler, .extra1 = &rds_tcp_min_sndbuf, }, #define RDS_TCP_RCVBUF 1 @@ -83,7 +85,7 @@ static struct ctl_table rds_tcp_sysctl_table[] = { /* data is per-net pointer */ .maxlen = sizeof(int), .mode = 0644, - .proc_handler = rds_tcp_skbuf_handler, + .proc_handler = rds_tcp_rcvbuf_handler, .extra1 = &rds_tcp_min_rcvbuf, }, }; @@ -682,10 +684,10 @@ static void rds_tcp_sysctl_reset(struct net *net) spin_unlock_irq(&rds_tcp_conn_lock); } -static int rds_tcp_skbuf_handler(const struct ctl_table *ctl, int write, +static int rds_tcp_skbuf_handler(struct rds_tcp_net *rtn, + const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *fpos) { - struct net *net = current->nsproxy->net_ns; int err; err = proc_dointvec_minmax(ctl, write, buffer, lenp, fpos); @@ -694,11 +696,34 @@ static int rds_tcp_skbuf_handler(const struct ctl_table *ctl, int write, *(int *)(ctl->extra1)); return err; } - if (write) + + if (write && rtn->rds_tcp_listen_sock && rtn->rds_tcp_listen_sock->sk) { + struct net *net 
= sock_net(rtn->rds_tcp_listen_sock->sk); + rds_tcp_sysctl_reset(net); + } + return 0; } +static int rds_tcp_sndbuf_handler(const struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *fpos) +{ + struct rds_tcp_net *rtn = container_of(ctl->data, struct rds_tcp_net, + sndbuf_size); + + return rds_tcp_skbuf_handler(rtn, ctl, write, buffer, lenp, fpos); +} + +static int rds_tcp_rcvbuf_handler(const struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *fpos) +{ + struct rds_tcp_net *rtn = container_of(ctl->data, struct rds_tcp_net, + rcvbuf_size); + + return rds_tcp_skbuf_handler(rtn, ctl, write, buffer, lenp, fpos); +} + static void rds_tcp_exit(void) { rds_tcp_set_unloading(); diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index 5502998aace7..5c2580a07530 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -356,7 +356,8 @@ static const struct nla_policy flow_policy[TCA_FLOW_MAX + 1] = { [TCA_FLOW_KEYS] = { .type = NLA_U32 }, [TCA_FLOW_MODE] = { .type = NLA_U32 }, [TCA_FLOW_BASECLASS] = { .type = NLA_U32 }, - [TCA_FLOW_RSHIFT] = { .type = NLA_U32 }, + [TCA_FLOW_RSHIFT] = NLA_POLICY_MAX(NLA_U32, + 31 /* BITS_PER_U32 - 1 */), [TCA_FLOW_ADDEND] = { .type = NLA_U32 }, [TCA_FLOW_MASK] = { .type = NLA_U32 }, [TCA_FLOW_XOR] = { .type = NLA_U32 }, diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index deb0925f536d..48dd8c88903f 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -630,6 +630,63 @@ static bool cake_ddst(int flow_mode) return (flow_mode & CAKE_FLOW_DUAL_DST) == CAKE_FLOW_DUAL_DST; } +static void cake_dec_srchost_bulk_flow_count(struct cake_tin_data *q, + struct cake_flow *flow, + int flow_mode) +{ + if (likely(cake_dsrc(flow_mode) && + q->hosts[flow->srchost].srchost_bulk_flow_count)) + q->hosts[flow->srchost].srchost_bulk_flow_count--; +} + +static void cake_inc_srchost_bulk_flow_count(struct cake_tin_data *q, + struct cake_flow *flow, + int flow_mode) +{ + if 
(likely(cake_dsrc(flow_mode) && + q->hosts[flow->srchost].srchost_bulk_flow_count < CAKE_QUEUES)) + q->hosts[flow->srchost].srchost_bulk_flow_count++; +} + +static void cake_dec_dsthost_bulk_flow_count(struct cake_tin_data *q, + struct cake_flow *flow, + int flow_mode) +{ + if (likely(cake_ddst(flow_mode) && + q->hosts[flow->dsthost].dsthost_bulk_flow_count)) + q->hosts[flow->dsthost].dsthost_bulk_flow_count--; +} + +static void cake_inc_dsthost_bulk_flow_count(struct cake_tin_data *q, + struct cake_flow *flow, + int flow_mode) +{ + if (likely(cake_ddst(flow_mode) && + q->hosts[flow->dsthost].dsthost_bulk_flow_count < CAKE_QUEUES)) + q->hosts[flow->dsthost].dsthost_bulk_flow_count++; +} + +static u16 cake_get_flow_quantum(struct cake_tin_data *q, + struct cake_flow *flow, + int flow_mode) +{ + u16 host_load = 1; + + if (cake_dsrc(flow_mode)) + host_load = max(host_load, + q->hosts[flow->srchost].srchost_bulk_flow_count); + + if (cake_ddst(flow_mode)) + host_load = max(host_load, + q->hosts[flow->dsthost].dsthost_bulk_flow_count); + + /* The get_random_u16() is a way to apply dithering to avoid + * accumulating roundoff errors + */ + return (q->flow_quantum * quantum_div[host_load] + + get_random_u16()) >> 16; +} + static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb, int flow_mode, u16 flow_override, u16 host_override) { @@ -776,10 +833,8 @@ skip_hash: allocate_dst = cake_ddst(flow_mode); if (q->flows[outer_hash + k].set == CAKE_SET_BULK) { - if (allocate_src) - q->hosts[q->flows[reduced_hash].srchost].srchost_bulk_flow_count--; - if (allocate_dst) - q->hosts[q->flows[reduced_hash].dsthost].dsthost_bulk_flow_count--; + cake_dec_srchost_bulk_flow_count(q, &q->flows[outer_hash + k], flow_mode); + cake_dec_dsthost_bulk_flow_count(q, &q->flows[outer_hash + k], flow_mode); } found: /* reserve queue for future packets in same flow */ @@ -804,9 +859,10 @@ found: q->hosts[outer_hash + k].srchost_tag = srchost_hash; found_src: srchost_idx = outer_hash + 
k; - if (q->flows[reduced_hash].set == CAKE_SET_BULK) - q->hosts[srchost_idx].srchost_bulk_flow_count++; q->flows[reduced_hash].srchost = srchost_idx; + + if (q->flows[reduced_hash].set == CAKE_SET_BULK) + cake_inc_srchost_bulk_flow_count(q, &q->flows[reduced_hash], flow_mode); } if (allocate_dst) { @@ -827,9 +883,10 @@ found_src: q->hosts[outer_hash + k].dsthost_tag = dsthost_hash; found_dst: dsthost_idx = outer_hash + k; - if (q->flows[reduced_hash].set == CAKE_SET_BULK) - q->hosts[dsthost_idx].dsthost_bulk_flow_count++; q->flows[reduced_hash].dsthost = dsthost_idx; + + if (q->flows[reduced_hash].set == CAKE_SET_BULK) + cake_inc_dsthost_bulk_flow_count(q, &q->flows[reduced_hash], flow_mode); } } @@ -1841,10 +1898,6 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, /* flowchain */ if (!flow->set || flow->set == CAKE_SET_DECAYING) { - struct cake_host *srchost = &b->hosts[flow->srchost]; - struct cake_host *dsthost = &b->hosts[flow->dsthost]; - u16 host_load = 1; - if (!flow->set) { list_add_tail(&flow->flowchain, &b->new_flows); } else { @@ -1854,18 +1907,8 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, flow->set = CAKE_SET_SPARSE; b->sparse_flow_count++; - if (cake_dsrc(q->flow_mode)) - host_load = max(host_load, srchost->srchost_bulk_flow_count); - - if (cake_ddst(q->flow_mode)) - host_load = max(host_load, dsthost->dsthost_bulk_flow_count); - - flow->deficit = (b->flow_quantum * - quantum_div[host_load]) >> 16; + flow->deficit = cake_get_flow_quantum(b, flow, q->flow_mode); } else if (flow->set == CAKE_SET_SPARSE_WAIT) { - struct cake_host *srchost = &b->hosts[flow->srchost]; - struct cake_host *dsthost = &b->hosts[flow->dsthost]; - /* this flow was empty, accounted as a sparse flow, but actually * in the bulk rotation. 
*/ @@ -1873,12 +1916,8 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, b->sparse_flow_count--; b->bulk_flow_count++; - if (cake_dsrc(q->flow_mode)) - srchost->srchost_bulk_flow_count++; - - if (cake_ddst(q->flow_mode)) - dsthost->dsthost_bulk_flow_count++; - + cake_inc_srchost_bulk_flow_count(b, flow, q->flow_mode); + cake_inc_dsthost_bulk_flow_count(b, flow, q->flow_mode); } if (q->buffer_used > q->buffer_max_used) @@ -1935,14 +1974,12 @@ static struct sk_buff *cake_dequeue(struct Qdisc *sch) { struct cake_sched_data *q = qdisc_priv(sch); struct cake_tin_data *b = &q->tins[q->cur_tin]; - struct cake_host *srchost, *dsthost; enum skb_drop_reason reason; ktime_t now = ktime_get(); struct cake_flow *flow; struct list_head *head; bool first_flow = true; struct sk_buff *skb; - u16 host_load; u64 delay; u32 len; @@ -2042,11 +2079,6 @@ retry: q->cur_flow = flow - b->flows; first_flow = false; - /* triple isolation (modified DRR++) */ - srchost = &b->hosts[flow->srchost]; - dsthost = &b->hosts[flow->dsthost]; - host_load = 1; - /* flow isolation (DRR++) */ if (flow->deficit <= 0) { /* Keep all flows with deficits out of the sparse and decaying @@ -2058,11 +2090,8 @@ retry: b->sparse_flow_count--; b->bulk_flow_count++; - if (cake_dsrc(q->flow_mode)) - srchost->srchost_bulk_flow_count++; - - if (cake_ddst(q->flow_mode)) - dsthost->dsthost_bulk_flow_count++; + cake_inc_srchost_bulk_flow_count(b, flow, q->flow_mode); + cake_inc_dsthost_bulk_flow_count(b, flow, q->flow_mode); flow->set = CAKE_SET_BULK; } else { @@ -2074,19 +2103,7 @@ retry: } } - if (cake_dsrc(q->flow_mode)) - host_load = max(host_load, srchost->srchost_bulk_flow_count); - - if (cake_ddst(q->flow_mode)) - host_load = max(host_load, dsthost->dsthost_bulk_flow_count); - - WARN_ON(host_load > CAKE_QUEUES); - - /* The get_random_u16() is a way to apply dithering to avoid - * accumulating roundoff errors - */ - flow->deficit += (b->flow_quantum * quantum_div[host_load] + - get_random_u16()) >> 
16; + flow->deficit += cake_get_flow_quantum(b, flow, q->flow_mode); list_move_tail(&flow->flowchain, &b->old_flows); goto retry; @@ -2110,11 +2127,8 @@ retry: if (flow->set == CAKE_SET_BULK) { b->bulk_flow_count--; - if (cake_dsrc(q->flow_mode)) - srchost->srchost_bulk_flow_count--; - - if (cake_ddst(q->flow_mode)) - dsthost->dsthost_bulk_flow_count--; + cake_dec_srchost_bulk_flow_count(b, flow, q->flow_mode); + cake_dec_dsthost_bulk_flow_count(b, flow, q->flow_mode); b->decaying_flow_count++; } else if (flow->set == CAKE_SET_SPARSE || @@ -2132,12 +2146,8 @@ retry: else if (flow->set == CAKE_SET_BULK) { b->bulk_flow_count--; - if (cake_dsrc(q->flow_mode)) - srchost->srchost_bulk_flow_count--; - - if (cake_ddst(q->flow_mode)) - dsthost->dsthost_bulk_flow_count--; - + cake_dec_srchost_bulk_flow_count(b, flow, q->flow_mode); + cake_dec_dsthost_bulk_flow_count(b, flow, q->flow_mode); } else b->decaying_flow_count--; diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 8874ae668095..14ab2f4c190a 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -551,25 +551,20 @@ static void dev_watchdog(struct timer_list *t) netdev_put(dev, &dev->watchdog_dev_tracker); } -void __netdev_watchdog_up(struct net_device *dev) -{ - if (dev->netdev_ops->ndo_tx_timeout) { - if (dev->watchdog_timeo <= 0) - dev->watchdog_timeo = 5*HZ; - if (!mod_timer(&dev->watchdog_timer, - round_jiffies(jiffies + dev->watchdog_timeo))) - netdev_hold(dev, &dev->watchdog_dev_tracker, - GFP_ATOMIC); - } -} -EXPORT_SYMBOL_GPL(__netdev_watchdog_up); - -static void dev_watchdog_up(struct net_device *dev) +void netdev_watchdog_up(struct net_device *dev) { - __netdev_watchdog_up(dev); + if (!dev->netdev_ops->ndo_tx_timeout) + return; + if (dev->watchdog_timeo <= 0) + dev->watchdog_timeo = 5*HZ; + if (!mod_timer(&dev->watchdog_timer, + round_jiffies(jiffies + dev->watchdog_timeo))) + netdev_hold(dev, &dev->watchdog_dev_tracker, + GFP_ATOMIC); } 
+EXPORT_SYMBOL_GPL(netdev_watchdog_up); -static void dev_watchdog_down(struct net_device *dev) +static void netdev_watchdog_down(struct net_device *dev) { netif_tx_lock_bh(dev); if (del_timer(&dev->watchdog_timer)) @@ -591,7 +586,7 @@ void netif_carrier_on(struct net_device *dev) atomic_inc(&dev->carrier_up_count); linkwatch_fire_event(dev); if (netif_running(dev)) - __netdev_watchdog_up(dev); + netdev_watchdog_up(dev); } } EXPORT_SYMBOL(netif_carrier_on); @@ -1267,7 +1262,7 @@ void dev_activate(struct net_device *dev) if (need_watchdog) { netif_trans_update(dev); - dev_watchdog_up(dev); + netdev_watchdog_up(dev); } } EXPORT_SYMBOL(dev_activate); @@ -1282,15 +1277,17 @@ static void qdisc_deactivate(struct Qdisc *qdisc) static void dev_deactivate_queue(struct net_device *dev, struct netdev_queue *dev_queue, - void *_qdisc_default) + void *_sync_needed) { - struct Qdisc *qdisc_default = _qdisc_default; + bool *sync_needed = _sync_needed; struct Qdisc *qdisc; qdisc = rtnl_dereference(dev_queue->qdisc); if (qdisc) { + if (qdisc->enqueue) + *sync_needed = true; qdisc_deactivate(qdisc); - rcu_assign_pointer(dev_queue->qdisc, qdisc_default); + rcu_assign_pointer(dev_queue->qdisc, &noop_qdisc); } } @@ -1357,24 +1354,22 @@ static bool some_qdisc_is_busy(struct net_device *dev) */ void dev_deactivate_many(struct list_head *head) { + bool sync_needed = false; struct net_device *dev; list_for_each_entry(dev, head, close_list) { netdev_for_each_tx_queue(dev, dev_deactivate_queue, - &noop_qdisc); + &sync_needed); if (dev_ingress_queue(dev)) dev_deactivate_queue(dev, dev_ingress_queue(dev), - &noop_qdisc); + &sync_needed); - dev_watchdog_down(dev); + netdev_watchdog_down(dev); } - /* Wait for outstanding qdisc-less dev_queue_xmit calls or - * outstanding qdisc enqueuing calls. - * This is avoided if all devices are in dismantle phase : - * Caller will call synchronize_net() for us - */ - synchronize_net(); + /* Wait for outstanding qdisc enqueuing calls. 
*/ + if (sync_needed) + synchronize_net(); list_for_each_entry(dev, head, close_list) { netdev_for_each_tx_queue(dev, dev_reset_queue, NULL); diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 8b9a1b96695e..29727ed1008e 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -43,6 +43,7 @@ #include <net/addrconf.h> #include <net/inet_common.h> #include <net/inet_ecn.h> +#include <net/inet_sock.h> #include <net/udp_tunnel.h> #include <net/inet_dscp.h> @@ -427,16 +428,19 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr, struct dst_entry *dst = NULL; union sctp_addr *daddr = &t->ipaddr; union sctp_addr dst_saddr; - u8 tos = READ_ONCE(inet_sk(sk)->tos); + dscp_t dscp; if (t->dscp & SCTP_DSCP_SET_MASK) - tos = t->dscp & SCTP_DSCP_VAL_MASK; + dscp = inet_dsfield_to_dscp(t->dscp); + else + dscp = inet_sk_dscp(inet_sk(sk)); + memset(&_fl, 0x0, sizeof(_fl)); fl4->daddr = daddr->v4.sin_addr.s_addr; fl4->fl4_dport = daddr->v4.sin_port; fl4->flowi4_proto = IPPROTO_SCTP; if (asoc) { - fl4->flowi4_tos = tos & INET_DSCP_MASK; + fl4->flowi4_tos = inet_dscp_to_dsfield(dscp); fl4->flowi4_scope = ip_sock_rt_scope(asoc->base.sk); fl4->flowi4_oif = asoc->base.sk->sk_bound_dev_if; fl4->fl4_sport = htons(asoc->base.bind_addr.port); diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index e5a5af343c4c..8e1e97be4df7 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -387,7 +387,8 @@ static struct ctl_table sctp_net_table[] = { static int proc_sctp_do_hmac_alg(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { - struct net *net = current->nsproxy->net_ns; + struct net *net = container_of(ctl->data, struct net, + sctp.sctp_hmac_alg); struct ctl_table tbl; bool changed = false; char *none = "none"; @@ -432,7 +433,7 @@ static int proc_sctp_do_hmac_alg(const struct ctl_table *ctl, int write, static int proc_sctp_do_rto_min(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { - 
struct net *net = current->nsproxy->net_ns; + struct net *net = container_of(ctl->data, struct net, sctp.rto_min); unsigned int min = *(unsigned int *) ctl->extra1; unsigned int max = *(unsigned int *) ctl->extra2; struct ctl_table tbl; @@ -460,7 +461,7 @@ static int proc_sctp_do_rto_min(const struct ctl_table *ctl, int write, static int proc_sctp_do_rto_max(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { - struct net *net = current->nsproxy->net_ns; + struct net *net = container_of(ctl->data, struct net, sctp.rto_max); unsigned int min = *(unsigned int *) ctl->extra1; unsigned int max = *(unsigned int *) ctl->extra2; struct ctl_table tbl; @@ -498,7 +499,7 @@ static int proc_sctp_do_alpha_beta(const struct ctl_table *ctl, int write, static int proc_sctp_do_auth(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { - struct net *net = current->nsproxy->net_ns; + struct net *net = container_of(ctl->data, struct net, sctp.auth_enable); struct ctl_table tbl; int new_value, ret; @@ -527,7 +528,7 @@ static int proc_sctp_do_auth(const struct ctl_table *ctl, int write, static int proc_sctp_do_udp_port(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { - struct net *net = current->nsproxy->net_ns; + struct net *net = container_of(ctl->data, struct net, sctp.udp_port); unsigned int min = *(unsigned int *)ctl->extra1; unsigned int max = *(unsigned int *)ctl->extra2; struct ctl_table tbl; @@ -568,7 +569,8 @@ static int proc_sctp_do_udp_port(const struct ctl_table *ctl, int write, static int proc_sctp_do_probe_interval(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { - struct net *net = current->nsproxy->net_ns; + struct net *net = container_of(ctl->data, struct net, + sctp.probe_interval); struct ctl_table tbl; int ret, new_value; diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index c370efcfe3e8..ca6984541edb 100644 --- a/net/smc/af_smc.c +++ 
b/net/smc/af_smc.c @@ -2741,7 +2741,7 @@ int smc_accept(struct socket *sock, struct socket *new_sock, release_sock(clcsk); } else if (!atomic_read(&smc_sk(nsk)->conn.bytes_to_rcv)) { lock_sock(nsk); - smc_rx_wait(smc_sk(nsk), &timeo, smc_rx_data_available); + smc_rx_wait(smc_sk(nsk), &timeo, 0, smc_rx_data_available); release_sock(nsk); } } diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index ccf57b7fe602..ac07b963aede 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -2155,7 +2155,7 @@ static int smcr_buf_map_link(struct smc_buf_desc *buf_desc, bool is_rmb, for_each_sg(buf_desc->sgt[lnk->link_idx].sgl, sg, nents, i) { size = min_t(int, PAGE_SIZE - offset, buf_size); sg_set_page(sg, vmalloc_to_page(buf), size, offset); - buf += size / sizeof(*buf); + buf += size; buf_size -= size; offset = 0; } diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c index f0cbe77a80b4..e7f1134453ef 100644 --- a/net/smc/smc_rx.c +++ b/net/smc/smc_rx.c @@ -197,7 +197,7 @@ static int smc_rx_splice(struct pipe_inode_info *pipe, char *src, size_t len, partial[i].offset = offset; partial[i].len = size; partial[i].private = (unsigned long)priv[i]; - buf += size / sizeof(*buf); + buf += size; left -= size; offset = 0; } @@ -238,22 +238,23 @@ out: return -ENOMEM; } -static int smc_rx_data_available_and_no_splice_pend(struct smc_connection *conn) +static int smc_rx_data_available_and_no_splice_pend(struct smc_connection *conn, size_t peeked) { - return atomic_read(&conn->bytes_to_rcv) && + return smc_rx_data_available(conn, peeked) && !atomic_read(&conn->splice_pending); } /* blocks rcvbuf consumer until >=len bytes available or timeout or interrupted * @smc smc socket * @timeo pointer to max seconds to wait, pointer to value 0 for no timeout + * @peeked number of bytes already peeked * @fcrit add'l criterion to evaluate as function pointer * Returns: * 1 if at least 1 byte available in rcvbuf or if socket error/shutdown. * 0 otherwise (nothing in rcvbuf nor timeout, e.g. 
interrupted). */ -int smc_rx_wait(struct smc_sock *smc, long *timeo, - int (*fcrit)(struct smc_connection *conn)) +int smc_rx_wait(struct smc_sock *smc, long *timeo, size_t peeked, + int (*fcrit)(struct smc_connection *conn, size_t baseline)) { DEFINE_WAIT_FUNC(wait, woken_wake_function); struct smc_connection *conn = &smc->conn; @@ -262,7 +263,7 @@ int smc_rx_wait(struct smc_sock *smc, long *timeo, struct sock *sk = &smc->sk; int rc; - if (fcrit(conn)) + if (fcrit(conn, peeked)) return 1; sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); add_wait_queue(sk_sleep(sk), &wait); @@ -271,7 +272,7 @@ int smc_rx_wait(struct smc_sock *smc, long *timeo, cflags->peer_conn_abort || READ_ONCE(sk->sk_shutdown) & RCV_SHUTDOWN || conn->killed || - fcrit(conn), + fcrit(conn, peeked), &wait); remove_wait_queue(sk_sleep(sk), &wait); sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); @@ -322,11 +323,11 @@ static int smc_rx_recv_urg(struct smc_sock *smc, struct msghdr *msg, int len, return -EAGAIN; } -static bool smc_rx_recvmsg_data_available(struct smc_sock *smc) +static bool smc_rx_recvmsg_data_available(struct smc_sock *smc, size_t peeked) { struct smc_connection *conn = &smc->conn; - if (smc_rx_data_available(conn)) + if (smc_rx_data_available(conn, peeked)) return true; else if (conn->urg_state == SMC_URG_VALID) /* we received a single urgent Byte - skip */ @@ -344,10 +345,10 @@ static bool smc_rx_recvmsg_data_available(struct smc_sock *smc) int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, struct pipe_inode_info *pipe, size_t len, int flags) { - size_t copylen, read_done = 0, read_remaining = len; + size_t copylen, read_done = 0, read_remaining = len, peeked_bytes = 0; size_t chunk_len, chunk_off, chunk_len_sum; struct smc_connection *conn = &smc->conn; - int (*func)(struct smc_connection *conn); + int (*func)(struct smc_connection *conn, size_t baseline); union smc_host_cursor cons; int readable, chunk; char *rcvbuf_base; @@ -384,14 +385,14 @@ int smc_rx_recvmsg(struct smc_sock *smc, 
struct msghdr *msg, if (conn->killed) break; - if (smc_rx_recvmsg_data_available(smc)) + if (smc_rx_recvmsg_data_available(smc, peeked_bytes)) goto copy; if (sk->sk_shutdown & RCV_SHUTDOWN) { /* smc_cdc_msg_recv_action() could have run after * above smc_rx_recvmsg_data_available() */ - if (smc_rx_recvmsg_data_available(smc)) + if (smc_rx_recvmsg_data_available(smc, peeked_bytes)) goto copy; break; } @@ -425,26 +426,28 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, } } - if (!smc_rx_data_available(conn)) { - smc_rx_wait(smc, &timeo, smc_rx_data_available); + if (!smc_rx_data_available(conn, peeked_bytes)) { + smc_rx_wait(smc, &timeo, peeked_bytes, smc_rx_data_available); continue; } copy: /* initialize variables for 1st iteration of subsequent loop */ /* could be just 1 byte, even after waiting on data above */ - readable = atomic_read(&conn->bytes_to_rcv); + readable = smc_rx_data_available(conn, peeked_bytes); splbytes = atomic_read(&conn->splice_pending); if (!readable || (msg && splbytes)) { if (splbytes) func = smc_rx_data_available_and_no_splice_pend; else func = smc_rx_data_available; - smc_rx_wait(smc, &timeo, func); + smc_rx_wait(smc, &timeo, peeked_bytes, func); continue; } smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn); + if ((flags & MSG_PEEK) && peeked_bytes) + smc_curs_add(conn->rmb_desc->len, &cons, peeked_bytes); /* subsequent splice() calls pick up where previous left */ if (splbytes) smc_curs_add(conn->rmb_desc->len, &cons, splbytes); @@ -480,6 +483,8 @@ copy: } read_remaining -= chunk_len; read_done += chunk_len; + if (flags & MSG_PEEK) + peeked_bytes += chunk_len; if (chunk_len_sum == copylen) break; /* either on 1st or 2nd iteration */ diff --git a/net/smc/smc_rx.h b/net/smc/smc_rx.h index db823c97d824..994f5e42d1ba 100644 --- a/net/smc/smc_rx.h +++ b/net/smc/smc_rx.h @@ -21,11 +21,11 @@ void smc_rx_init(struct smc_sock *smc); int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, struct pipe_inode_info *pipe, 
size_t len, int flags); -int smc_rx_wait(struct smc_sock *smc, long *timeo, - int (*fcrit)(struct smc_connection *conn)); -static inline int smc_rx_data_available(struct smc_connection *conn) +int smc_rx_wait(struct smc_sock *smc, long *timeo, size_t peeked, + int (*fcrit)(struct smc_connection *conn, size_t baseline)); +static inline int smc_rx_data_available(struct smc_connection *conn, size_t peeked) { - return atomic_read(&conn->bytes_to_rcv); + return atomic_read(&conn->bytes_to_rcv) - peeked; } #endif /* SMC_RX_H */ diff --git a/net/socket.c b/net/socket.c index 16402b8be5a7..4afe31656a2b 100644 --- a/net/socket.c +++ b/net/socket.c @@ -110,6 +110,8 @@ #include <linux/ptp_clock_kernel.h> #include <trace/events/sock.h> +#include "core/dev.h" + #ifdef CONFIG_NET_RX_BUSY_POLL unsigned int sysctl_net_busy_read __read_mostly; unsigned int sysctl_net_busy_poll __read_mostly; diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 9ee5a83c5b40..99ca4465f702 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -737,6 +737,10 @@ static int do_tls_setsockopt_conf(struct sock *sk, sockptr_t optval, else ctx->rx_conf = conf; update_sk_prot(sk, ctx); + + if (update) + return 0; + if (tx) { ctx->sk_write_space = sk->sk_write_space; sk->sk_write_space = tls_write_space; diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 47550d485819..914d4e1516a3 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -458,7 +458,7 @@ int tls_tx_records(struct sock *sk, int flags) tx_err: if (rc < 0 && rc != -EAGAIN) - tls_err_abort(sk, -EBADMSG); + tls_err_abort(sk, rc); return rc; } diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig index d7b16f2c23e9..f0157702718f 100644 --- a/net/xfrm/Kconfig +++ b/net/xfrm/Kconfig @@ -135,6 +135,22 @@ config NET_KEY_MIGRATE If unsure, say N. +config XFRM_IPTFS + tristate "IPsec IP-TFS/AGGFRAG (RFC 9347) encapsulation support" + depends on XFRM + help + Information on the IP-TFS/AGGFRAG encapsulation can be found + in RFC 9347. 
This feature supports demand driven (i.e., + non-constant send rate) IP-TFS to take advantage of the + AGGFRAG ESP payload encapsulation. This payload type + supports aggregation and fragmentation of the inner IP + packet stream which in turn yields higher small-packet + bandwidth as well as reducing MTU/PMTU issues. Congestion + control is unimplemented as the send rate is demand driven + rather than constant. + + If unsure, say N. + config XFRM_ESPINTCP bool diff --git a/net/xfrm/Makefile b/net/xfrm/Makefile index 512e0b2f8514..5a1787587cb3 100644 --- a/net/xfrm/Makefile +++ b/net/xfrm/Makefile @@ -21,5 +21,6 @@ obj-$(CONFIG_XFRM_USER) += xfrm_user.o obj-$(CONFIG_XFRM_USER_COMPAT) += xfrm_compat.o obj-$(CONFIG_XFRM_IPCOMP) += xfrm_ipcomp.o obj-$(CONFIG_XFRM_INTERFACE) += xfrm_interface.o +obj-$(CONFIG_XFRM_IPTFS) += xfrm_iptfs.o obj-$(CONFIG_XFRM_ESPINTCP) += espintcp.o obj-$(CONFIG_DEBUG_INFO_BTF) += xfrm_state_bpf.o diff --git a/net/xfrm/trace_iptfs.h b/net/xfrm/trace_iptfs.h new file mode 100644 index 000000000000..74391ba24445 --- /dev/null +++ b/net/xfrm/trace_iptfs.h @@ -0,0 +1,218 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* xfrm_trace_iptfs.h + * + * August 12 2023, Christian Hopps <chopps@labn.net> + * + * Copyright (c) 2023, LabN Consulting, L.L.C.
+ */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM iptfs + +#if !defined(_TRACE_IPTFS_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_IPTFS_H + +#include <linux/kernel.h> +#include <linux/skbuff.h> +#include <linux/tracepoint.h> +#include <net/ip.h> + +struct xfrm_iptfs_data; + +TRACE_EVENT(iptfs_egress_recv, + TP_PROTO(struct sk_buff *skb, struct xfrm_iptfs_data *xtfs, u16 blkoff), + TP_ARGS(skb, xtfs, blkoff), + TP_STRUCT__entry(__field(struct sk_buff *, skb) + __field(void *, head) + __field(void *, head_pg_addr) + __field(void *, pg0addr) + __field(u32, skb_len) + __field(u32, data_len) + __field(u32, headroom) + __field(u32, tailroom) + __field(u32, tail) + __field(u32, end) + __field(u32, pg0off) + __field(u8, head_frag) + __field(u8, frag_list) + __field(u8, nr_frags) + __field(u16, blkoff)), + TP_fast_assign(__entry->skb = skb; + __entry->head = skb->head; + __entry->skb_len = skb->len; + __entry->data_len = skb->data_len; + __entry->headroom = skb_headroom(skb); + __entry->tailroom = skb_tailroom(skb); + __entry->tail = (u32)skb->tail; + __entry->end = (u32)skb->end; + __entry->head_frag = skb->head_frag; + __entry->frag_list = (bool)skb_shinfo(skb)->frag_list; + __entry->nr_frags = skb_shinfo(skb)->nr_frags; + __entry->blkoff = blkoff; + __entry->head_pg_addr = page_address(virt_to_head_page(skb->head)); + __entry->pg0addr = (__entry->nr_frags + ? page_address(netmem_to_page(skb_shinfo(skb)->frags[0].netmem)) + : NULL); + __entry->pg0off = (__entry->nr_frags + ? 
skb_shinfo(skb)->frags[0].offset + : 0); + ), + TP_printk("EGRESS: skb=%p len=%u data_len=%u headroom=%u head_frag=%u frag_list=%u nr_frags=%u blkoff=%u\n\t\ttailroom=%u tail=%u end=%u head=%p hdpgaddr=%p pg0->addr=%p pg0->data=%p pg0->off=%u", + __entry->skb, __entry->skb_len, __entry->data_len, __entry->headroom, + __entry->head_frag, __entry->frag_list, __entry->nr_frags, __entry->blkoff, + __entry->tailroom, __entry->tail, __entry->end, __entry->head, + __entry->head_pg_addr, __entry->pg0addr, __entry->pg0addr + __entry->pg0off, + __entry->pg0off) + ) + +DECLARE_EVENT_CLASS(iptfs_ingress_preq_event, + TP_PROTO(struct sk_buff *skb, struct xfrm_iptfs_data *xtfs, + u32 pmtu, u8 was_gso), + TP_ARGS(skb, xtfs, pmtu, was_gso), + TP_STRUCT__entry(__field(struct sk_buff *, skb) + __field(u32, skb_len) + __field(u32, data_len) + __field(u32, pmtu) + __field(u32, queue_size) + __field(u32, proto_seq) + __field(u8, proto) + __field(u8, was_gso) + ), + TP_fast_assign(__entry->skb = skb; + __entry->skb_len = skb->len; + __entry->data_len = skb->data_len; + __entry->queue_size = + xtfs->cfg.max_queue_size - xtfs->queue_size; + __entry->proto = __trace_ip_proto(ip_hdr(skb)); + __entry->proto_seq = __trace_ip_proto_seq(ip_hdr(skb)); + __entry->pmtu = pmtu; + __entry->was_gso = was_gso; + ), + TP_printk("INGRPREQ: skb=%p len=%u data_len=%u qsize=%u proto=%u proto_seq=%u pmtu=%u was_gso=%u", + __entry->skb, __entry->skb_len, __entry->data_len, + __entry->queue_size, __entry->proto, __entry->proto_seq, + __entry->pmtu, __entry->was_gso)); + +DEFINE_EVENT(iptfs_ingress_preq_event, iptfs_enqueue, + TP_PROTO(struct sk_buff *skb, struct xfrm_iptfs_data *xtfs, u32 pmtu, u8 was_gso), + TP_ARGS(skb, xtfs, pmtu, was_gso)); + +DEFINE_EVENT(iptfs_ingress_preq_event, iptfs_no_queue_space, + TP_PROTO(struct sk_buff *skb, struct xfrm_iptfs_data *xtfs, u32 pmtu, u8 was_gso), + TP_ARGS(skb, xtfs, pmtu, was_gso)); + +DEFINE_EVENT(iptfs_ingress_preq_event, iptfs_too_big, + TP_PROTO(struct sk_buff 
*skb, struct xfrm_iptfs_data *xtfs, u32 pmtu, u8 was_gso), + TP_ARGS(skb, xtfs, pmtu, was_gso)); + +DECLARE_EVENT_CLASS(iptfs_ingress_postq_event, + TP_PROTO(struct sk_buff *skb, u32 mtu, u16 blkoff, struct iphdr *iph), + TP_ARGS(skb, mtu, blkoff, iph), + TP_STRUCT__entry(__field(struct sk_buff *, skb) + __field(u32, skb_len) + __field(u32, data_len) + __field(u32, mtu) + __field(u32, proto_seq) + __field(u16, blkoff) + __field(u8, proto)), + TP_fast_assign(__entry->skb = skb; + __entry->skb_len = skb->len; + __entry->data_len = skb->data_len; + __entry->mtu = mtu; + __entry->blkoff = blkoff; + __entry->proto = iph ? __trace_ip_proto(iph) : 0; + __entry->proto_seq = iph ? __trace_ip_proto_seq(iph) : 0; + ), + TP_printk("INGRPSTQ: skb=%p len=%u data_len=%u mtu=%u blkoff=%u proto=%u proto_seq=%u", + __entry->skb, __entry->skb_len, __entry->data_len, __entry->mtu, + __entry->blkoff, __entry->proto, __entry->proto_seq)); + +DEFINE_EVENT(iptfs_ingress_postq_event, iptfs_first_dequeue, + TP_PROTO(struct sk_buff *skb, u32 mtu, u16 blkoff, + struct iphdr *iph), + TP_ARGS(skb, mtu, blkoff, iph)); + +DEFINE_EVENT(iptfs_ingress_postq_event, iptfs_first_fragmenting, + TP_PROTO(struct sk_buff *skb, u32 mtu, u16 blkoff, + struct iphdr *iph), + TP_ARGS(skb, mtu, blkoff, iph)); + +DEFINE_EVENT(iptfs_ingress_postq_event, iptfs_first_final_fragment, + TP_PROTO(struct sk_buff *skb, u32 mtu, u16 blkoff, + struct iphdr *iph), + TP_ARGS(skb, mtu, blkoff, iph)); + +DEFINE_EVENT(iptfs_ingress_postq_event, iptfs_first_toobig, + TP_PROTO(struct sk_buff *skb, u32 mtu, u16 blkoff, + struct iphdr *iph), + TP_ARGS(skb, mtu, blkoff, iph)); + +TRACE_EVENT(iptfs_ingress_nth_peek, + TP_PROTO(struct sk_buff *skb, u32 remaining), + TP_ARGS(skb, remaining), + TP_STRUCT__entry(__field(struct sk_buff *, skb) + __field(u32, skb_len) + __field(u32, remaining)), + TP_fast_assign(__entry->skb = skb; + __entry->skb_len = skb->len; + __entry->remaining = remaining; + ), + TP_printk("INGRPSTQ: NTHPEEK: skb=%p 
len=%u remaining=%u", + __entry->skb, __entry->skb_len, __entry->remaining)); + +TRACE_EVENT(iptfs_ingress_nth_add, TP_PROTO(struct sk_buff *skb, u8 share_ok), + TP_ARGS(skb, share_ok), + TP_STRUCT__entry(__field(struct sk_buff *, skb) + __field(u32, skb_len) + __field(u32, data_len) + __field(u8, share_ok) + __field(u8, head_frag) + __field(u8, pp_recycle) + __field(u8, cloned) + __field(u8, shared) + __field(u8, nr_frags) + __field(u8, frag_list) + ), + TP_fast_assign(__entry->skb = skb; + __entry->skb_len = skb->len; + __entry->data_len = skb->data_len; + __entry->share_ok = share_ok; + __entry->head_frag = skb->head_frag; + __entry->pp_recycle = skb->pp_recycle; + __entry->cloned = skb_cloned(skb); + __entry->shared = skb_shared(skb); + __entry->nr_frags = skb_shinfo(skb)->nr_frags; + __entry->frag_list = (bool)skb_shinfo(skb)->frag_list; + ), + TP_printk("INGRPSTQ: NTHADD: skb=%p len=%u data_len=%u share_ok=%u head_frag=%u pp_recycle=%u cloned=%u shared=%u nr_frags=%u frag_list=%u", + __entry->skb, __entry->skb_len, __entry->data_len, __entry->share_ok, + __entry->head_frag, __entry->pp_recycle, __entry->cloned, __entry->shared, + __entry->nr_frags, __entry->frag_list)); + +DECLARE_EVENT_CLASS(iptfs_timer_event, + TP_PROTO(struct xfrm_iptfs_data *xtfs, u64 time_val), + TP_ARGS(xtfs, time_val), + TP_STRUCT__entry(__field(u64, time_val) + __field(u64, set_time)), + TP_fast_assign(__entry->time_val = time_val; + __entry->set_time = xtfs->iptfs_settime; + ), + TP_printk("TIMER: set_time=%llu time_val=%llu", + __entry->set_time, __entry->time_val)); + +DEFINE_EVENT(iptfs_timer_event, iptfs_timer_start, + TP_PROTO(struct xfrm_iptfs_data *xtfs, u64 time_val), + TP_ARGS(xtfs, time_val)); + +DEFINE_EVENT(iptfs_timer_event, iptfs_timer_expire, + TP_PROTO(struct xfrm_iptfs_data *xtfs, u64 time_val), + TP_ARGS(xtfs, time_val)); + +#endif /* _TRACE_IPTFS_H */ + +/* This part must be outside protection */ +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH ../../net/xfrm 
+#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE trace_iptfs +#include <trace/define_trace.h> diff --git a/net/xfrm/xfrm_compat.c b/net/xfrm/xfrm_compat.c index 5b9ee63e30b6..b8d2e6930041 100644 --- a/net/xfrm/xfrm_compat.c +++ b/net/xfrm/xfrm_compat.c @@ -284,9 +284,15 @@ static int xfrm_xlate64_attr(struct sk_buff *dst, const struct nlattr *src) case XFRMA_SA_DIR: case XFRMA_NAT_KEEPALIVE_INTERVAL: case XFRMA_SA_PCPU: + case XFRMA_IPTFS_DROP_TIME: + case XFRMA_IPTFS_REORDER_WINDOW: + case XFRMA_IPTFS_DONT_FRAG: + case XFRMA_IPTFS_INIT_DELAY: + case XFRMA_IPTFS_MAX_QSIZE: + case XFRMA_IPTFS_PKT_SIZE: return xfrm_nla_cpy(dst, src, nla_len(src)); default: - BUILD_BUG_ON(XFRMA_MAX != XFRMA_SA_PCPU); + BUILD_BUG_ON(XFRMA_MAX != XFRMA_IPTFS_PKT_SIZE); pr_warn_once("unsupported nla_type %d\n", src->nla_type); return -EOPNOTSUPP; } @@ -441,7 +447,7 @@ static int xfrm_xlate32_attr(void *dst, const struct nlattr *nla, int err; if (type > XFRMA_MAX) { - BUILD_BUG_ON(XFRMA_MAX != XFRMA_SA_PCPU); + BUILD_BUG_ON(XFRMA_MAX != XFRMA_IPTFS_PKT_SIZE); NL_SET_ERR_MSG(extack, "Bad attribute"); return -EOPNOTSUPP; } diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index b33c4591e09a..d1fa94e52cea 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -42,7 +42,8 @@ static void __xfrm_mode_tunnel_prep(struct xfrm_state *x, struct sk_buff *skb, skb->transport_header = skb->network_header + hsize; skb_reset_mac_len(skb); - pskb_pull(skb, skb->mac_len + x->props.header_len); + pskb_pull(skb, + skb->mac_len + x->props.header_len - x->props.enc_hdr_len); } static void __xfrm_mode_beet_prep(struct xfrm_state *x, struct sk_buff *skb, @@ -68,6 +69,7 @@ static void __xfrm_mode_beet_prep(struct xfrm_state *x, struct sk_buff *skb, static void xfrm_outer_mode_prep(struct xfrm_state *x, struct sk_buff *skb) { switch (x->outer_mode.encap) { + case XFRM_MODE_IPTFS: case XFRM_MODE_TUNNEL: if (x->outer_mode.family == AF_INET) return __xfrm_mode_tunnel_prep(x, skb, diff 
--git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 841a60a6fbfe..7e6a71b9d6a3 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -48,7 +48,7 @@ static DEFINE_SPINLOCK(xfrm_input_afinfo_lock); static struct xfrm_input_afinfo const __rcu *xfrm_input_afinfo[2][AF_INET6 + 1]; static struct gro_cells gro_cells; -static struct net_device xfrm_napi_dev; +static struct net_device *xfrm_napi_dev; static DEFINE_PER_CPU(struct xfrm_trans_tasklet, xfrm_trans_tasklet); @@ -446,6 +446,9 @@ static int xfrm_inner_mode_input(struct xfrm_state *x, WARN_ON_ONCE(1); break; default: + if (x->mode_cbs && x->mode_cbs->input) + return x->mode_cbs->input(x, skb); + WARN_ON_ONCE(1); break; } @@ -453,6 +456,10 @@ static int xfrm_inner_mode_input(struct xfrm_state *x, return -EOPNOTSUPP; } +/* NOTE: encap_type - In addition to the normal (non-negative) values for + * encap_type, a negative value of -1 or -2 can be used to resume/restart this + * function after a previous invocation early terminated for async operation. + */ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) { const struct xfrm_state_afinfo *afinfo; @@ -489,6 +496,10 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) family = x->props.family; + /* An encap_type of -2 indicates reconstructed inner packet */ + if (encap_type == -2) + goto resume_decapped; + /* An encap_type of -1 indicates async resumption. 
*/ if (encap_type == -1) { async = 1; @@ -679,11 +690,14 @@ resume: XFRM_MODE_SKB_CB(skb)->protocol = nexthdr; - if (xfrm_inner_mode_input(x, skb)) { + err = xfrm_inner_mode_input(x, skb); + if (err == -EINPROGRESS) + return 0; + else if (err) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEMODEERROR); goto drop; } - +resume_decapped: if (x->outer_mode.flags & XFRM_MODE_FLAG_TUNNEL) { decaps = 1; break; @@ -811,8 +825,11 @@ void __init xfrm_input_init(void) int err; int i; - init_dummy_netdev(&xfrm_napi_dev); - err = gro_cells_init(&gro_cells, &xfrm_napi_dev); + xfrm_napi_dev = alloc_netdev_dummy(0); + if (!xfrm_napi_dev) + panic("Failed to allocate XFRM dummy netdev\n"); + + err = gro_cells_init(&gro_cells, xfrm_napi_dev); if (err) gro_cells.cells = NULL; diff --git a/net/xfrm/xfrm_iptfs.c b/net/xfrm/xfrm_iptfs.c new file mode 100644 index 000000000000..755f1eea8bfa --- /dev/null +++ b/net/xfrm/xfrm_iptfs.c @@ -0,0 +1,2764 @@ +// SPDX-License-Identifier: GPL-2.0 +/* xfrm_iptfs: IPTFS encapsulation support + * + * April 21 2022, Christian Hopps <chopps@labn.net> + * + * Copyright (c) 2022, LabN Consulting, L.L.C. + * + */ + +#include <linux/kernel.h> +#include <linux/icmpv6.h> +#include <linux/skbuff_ref.h> +#include <net/gro.h> +#include <net/icmp.h> +#include <net/ip6_route.h> +#include <net/inet_ecn.h> +#include <net/xfrm.h> + +#include <crypto/aead.h> + +#include "xfrm_inout.h" +#include "trace_iptfs.h" + +/* IPTFS encap (header) values. */ +#define IPTFS_SUBTYPE_BASIC 0 +#define IPTFS_SUBTYPE_CC 1 + +/* ----------------------------------------------- */ +/* IP-TFS default SA values (tunnel egress/dir-in) */ +/* ----------------------------------------------- */ + +/** + * define IPTFS_DEFAULT_DROP_TIME_USECS - default drop time + * + * The default IPTFS drop time in microseconds. The drop time is the amount of + * time before a missing out-of-order IPTFS tunnel packet is considered lost. + * See also the reorder window. + * + * Default 1s. 
+ */ +#define IPTFS_DEFAULT_DROP_TIME_USECS 1000000 + +/** + * define IPTFS_DEFAULT_REORDER_WINDOW - default reorder window size + * + * The default IPTFS reorder window size. The reorder window size dictates the + * maximum number of IPTFS tunnel packets in a sequence that may arrive out of + * order. + * + * Default 3. (tcp folks suggested) + */ +#define IPTFS_DEFAULT_REORDER_WINDOW 3 + +/* ------------------------------------------------ */ +/* IPTFS default SA values (tunnel ingress/dir-out) */ +/* ------------------------------------------------ */ + +/** + * define IPTFS_DEFAULT_INIT_DELAY_USECS - default initial output delay + * + * The initial output delay is the amount of time prior to servicing the output + * queue after queueing the first packet on said queue. This applies anytime the + * output queue was previously empty. + * + * Default 0. + */ +#define IPTFS_DEFAULT_INIT_DELAY_USECS 0 + +/** + * define IPTFS_DEFAULT_MAX_QUEUE_SIZE - default max output queue size. + * + * The default IPTFS max output queue size in octets. The output queue is where + * received packets destined for output over an IPTFS tunnel are stored prior to + * being output in aggregated/fragmented form over the IPTFS tunnel. + * + * Default 10M. + */ +#define IPTFS_DEFAULT_MAX_QUEUE_SIZE (1024 * 10240) + +/* Assumed: skb->head is cache aligned. + * + * L2 Header resv: Arrange for cacheline to start at skb->data - 16 to keep the + * to-be-pushed L2 header in the same cacheline as resulting `skb->data` (i.e., + * the L3 header). If cacheline size is > 64 then skb->data + pushed L2 will all + * be in a single cacheline if we simply reserve 64 bytes. + * + * L3 Header resv: For L3+L2 headers (i.e., skb->data points at the IPTFS payload) + * we want `skb->data` to be cacheline aligned and all pushed L2L3 headers will + * be in their own cacheline[s].
128 works for cachelines up to 128 bytes, for + * any larger cacheline sizes the pushed headers will simply share the cacheline + * with the start of the IPTFS payload (skb->data). + */ +#define XFRM_IPTFS_MIN_L3HEADROOM 128 +#define XFRM_IPTFS_MIN_L2HEADROOM (L1_CACHE_BYTES > 64 ? 64 : 64 + 16) + +/* Min to try to share outer iptfs skb data vs copying into new skb */ +#define IPTFS_PKT_SHARE_MIN 129 + +#define NSECS_IN_USEC 1000 + +#define IPTFS_HRTIMER_MODE HRTIMER_MODE_REL_SOFT + +/** + * struct xfrm_iptfs_config - configuration for the IPTFS tunnel. + * @pkt_size: size of the outer IP packet. 0 to use interface and MTU discovery, + * otherwise the user specified value. + * @max_queue_size: The maximum number of octets allowed to be queued to be sent + * over the IPTFS SA. The queue size is measured as the size of all the + * packets enqueued. + * @reorder_win_size: the number of slots in the reorder window, thus the number of + * packets that may arrive out of order. + * @dont_frag: true to inhibit fragmenting across IPTFS outer packets. + */ +struct xfrm_iptfs_config { + u32 pkt_size; /* outer_packet_size or 0 */ + u32 max_queue_size; /* octets */ + u16 reorder_win_size; + u8 dont_frag : 1; +}; + +struct skb_wseq { + struct sk_buff *skb; + u64 drop_time; +}; + +/** + * struct xfrm_iptfs_data - mode specific xfrm state. + * @cfg: IPTFS tunnel config. + * @x: owning SA (xfrm_state). + * @queue: queued user packets to send. + * @queue_size: number of octets on queue (sum of packet sizes). + * @ecn_queue_size: octets above which to ECN mark. + * @init_delay_ns: nanoseconds to wait to send initial IPTFS packet. + * @iptfs_timer: output timer. + * @iptfs_settime: time the output timer was set. + * @payload_mtu: max payload size. + * @w_seq_set: true after first seq received. + * @w_wantseq: waiting for this seq number as next to process (in order). + * @w_saved: the saved buf array (reorder window). + * @w_savedlen: the saved len (not size).
+ * @drop_lock: lock to protect reorder queue. + * @drop_timer: timer for considering next packet lost. + * @drop_time_ns: timer interval in nanoseconds. + * @ra_newskb: new pkt being reassembled. + * @ra_wantseq: expected next sequence for reassembly. + * @ra_runt: last pkt bytes from very end of last skb. + * @ra_runtlen: size of ra_runt. + */ +struct xfrm_iptfs_data { + struct xfrm_iptfs_config cfg; + + /* Ingress User Input */ + struct xfrm_state *x; /* owning state */ + struct sk_buff_head queue; /* output queue */ + + u32 queue_size; /* octets */ + u32 ecn_queue_size; /* octets above which ECN mark */ + u64 init_delay_ns; /* nanoseconds */ + struct hrtimer iptfs_timer; /* output timer */ + time64_t iptfs_settime; /* time timer was set */ + u32 payload_mtu; /* max payload size */ + + /* Tunnel input reordering */ + bool w_seq_set; /* true after first seq received */ + u64 w_wantseq; /* expected next sequence */ + struct skb_wseq *w_saved; /* the saved buf array */ + u32 w_savedlen; /* the saved len (not size) */ + spinlock_t drop_lock; + struct hrtimer drop_timer; + u64 drop_time_ns; + + /* Tunnel input reassembly */ + struct sk_buff *ra_newskb; /* new pkt being reassembled */ + u64 ra_wantseq; /* expected next sequence */ + u8 ra_runt[6]; /* last pkt bytes from last skb */ + u8 ra_runtlen; /* count of ra_runt */ +}; + +static u32 __iptfs_get_inner_mtu(struct xfrm_state *x, int outer_mtu); +static enum hrtimer_restart iptfs_delay_timer(struct hrtimer *me); +static enum hrtimer_restart iptfs_drop_timer(struct hrtimer *me); + +/* ================= */ +/* Utility Functions */ +/* ================= */ + +#ifdef TRACEPOINTS_ENABLED +static u32 __trace_ip_proto(struct iphdr *iph) +{ + if (iph->version == 4) + return iph->protocol; + return ((struct ipv6hdr *)iph)->nexthdr; +} + +static u32 __trace_ip_proto_seq(struct iphdr *iph) +{ + void *nexthdr; + u32 protocol = 0; + + if (iph->version == 4) { + nexthdr = (void *)(iph + 1); + protocol = iph->protocol; + } else if
(iph->version == 6) { + nexthdr = (void *)(((struct ipv6hdr *)(iph)) + 1); + protocol = ((struct ipv6hdr *)(iph))->nexthdr; + } + switch (protocol) { + case IPPROTO_ICMP: + return ntohs(((struct icmphdr *)nexthdr)->un.echo.sequence); + case IPPROTO_ICMPV6: + return ntohs(((struct icmp6hdr *)nexthdr)->icmp6_sequence); + case IPPROTO_TCP: + return ntohl(((struct tcphdr *)nexthdr)->seq); + case IPPROTO_UDP: + return ntohs(((struct udphdr *)nexthdr)->source); + default: + return 0; + } +} +#endif /*TRACEPOINTS_ENABLED*/ + +static u64 __esp_seq(struct sk_buff *skb) +{ + u64 seq = ntohl(XFRM_SKB_CB(skb)->seq.input.low); + + return seq | (u64)ntohl(XFRM_SKB_CB(skb)->seq.input.hi) << 32; +} + +/* ======================= */ +/* IPTFS SK_BUFF Functions */ +/* ======================= */ + +/** + * iptfs_alloc_skb() - Allocate a new `skb`. + * @tpl: the skb to copy required meta-data from. + * @len: the linear length of the head data, zero is fine. + * @l3resv: true if skb reserve needs to support pushing L3 headers + * + * A new `skb` is allocated and required meta-data is copied from `tpl`, the + * head data is sized to `len` + reserved space set according to the @l3resv + * boolean. + * + * When @l3resv is false, resv is XFRM_IPTFS_MIN_L2HEADROOM which arranges for + * `skb->data - 16` which is a good guess for good cache alignment (placing the + * to be pushed L2 header at the start of a cacheline). + * + * Otherwise, @l3resv is true and resv is set to the correct reserved space for + * dst->dev plus the calculated L3 overhead for the xfrm dst or + * XFRM_IPTFS_MIN_L3HEADROOM whichever is larger. This is then cache aligned so + * that all the headers will commonly fall in a cacheline when possible. + * + * l3resv=true is used on tunnel ingress (tx), because we need to reserve for + * the new IPTFS packet (i.e., L2+L3 headers). On tunnel egress (rx) the data + * being copied into the skb includes the user L3 headers already so we only + * need to reserve for L2.
+ * + * Return: the new skb or NULL. + */ +static struct sk_buff *iptfs_alloc_skb(struct sk_buff *tpl, u32 len, bool l3resv) +{ + struct sk_buff *skb; + u32 resv; + + if (!l3resv) { + resv = XFRM_IPTFS_MIN_L2HEADROOM; + } else { + struct dst_entry *dst = skb_dst(tpl); + + resv = LL_RESERVED_SPACE(dst->dev) + dst->header_len; + resv = max(resv, XFRM_IPTFS_MIN_L3HEADROOM); + resv = L1_CACHE_ALIGN(resv); + } + + skb = alloc_skb(len + resv, GFP_ATOMIC | __GFP_NOWARN); + if (!skb) + return NULL; + + skb_reserve(skb, resv); + + if (!l3resv) { + /* xfrm_input resume needs dev and xfrm ext from tunnel pkt */ + skb->dev = tpl->dev; + __skb_ext_copy(skb, tpl); + } + + /* dropped by xfrm_input, used by xfrm_output */ + skb_dst_copy(skb, tpl); + + return skb; +} + +/** + * iptfs_skb_head_to_frag() - initialize a skb_frag_t based on skb head data + * @skb: skb with the head data + * @frag: frag to initialize + */ +static void iptfs_skb_head_to_frag(const struct sk_buff *skb, skb_frag_t *frag) +{ + struct page *page = virt_to_head_page(skb->data); + unsigned char *addr = (unsigned char *)page_address(page); + + skb_frag_fill_page_desc(frag, page, skb->data - addr, skb_headlen(skb)); +} + +/** + * struct iptfs_skb_frag_walk - use to track a walk through fragments + * @fragi: current fragment index + * @past: length of data in fragments before @fragi + * @total: length of data in all fragments + * @nr_frags: number of fragments present in array + * @initial_offset: the value passed in to skb_prepare_frag_walk() + * @frags: the page fragments inc. room for head page + * @pp_recycle: copy of skb->pp_recycle + */ +struct iptfs_skb_frag_walk { + u32 fragi; + u32 past; + u32 total; + u32 nr_frags; + u32 initial_offset; + skb_frag_t frags[MAX_SKB_FRAGS + 1]; + bool pp_recycle; +}; + +/** + * iptfs_skb_prepare_frag_walk() - initialize a frag walk over an skb. + * @skb: the skb to walk. + * @initial_offset: start the walk @initial_offset into the skb. 
+ * @walk: the walk to initialize + * + * Future calls to skb_add_frags() will expect the @offset value to be at + * least @initial_offset large. + */ +static void iptfs_skb_prepare_frag_walk(struct sk_buff *skb, u32 initial_offset, + struct iptfs_skb_frag_walk *walk) +{ + struct skb_shared_info *shinfo = skb_shinfo(skb); + skb_frag_t *frag, *from; + u32 i; + + walk->initial_offset = initial_offset; + walk->fragi = 0; + walk->past = 0; + walk->total = 0; + walk->nr_frags = 0; + walk->pp_recycle = skb->pp_recycle; + + if (skb->head_frag) { + if (initial_offset >= skb_headlen(skb)) { + initial_offset -= skb_headlen(skb); + } else { + frag = &walk->frags[walk->nr_frags++]; + iptfs_skb_head_to_frag(skb, frag); + frag->offset += initial_offset; + frag->len -= initial_offset; + walk->total += frag->len; + initial_offset = 0; + } + } else { + initial_offset -= skb_headlen(skb); + } + + for (i = 0; i < shinfo->nr_frags; i++) { + from = &shinfo->frags[i]; + if (initial_offset >= from->len) { + initial_offset -= from->len; + continue; + } + frag = &walk->frags[walk->nr_frags++]; + *frag = *from; + if (initial_offset) { + frag->offset += initial_offset; + frag->len -= initial_offset; + initial_offset = 0; + } + walk->total += frag->len; + } +} + +static u32 iptfs_skb_reset_frag_walk(struct iptfs_skb_frag_walk *walk, + u32 offset) +{ + /* Adjust offset to refer to internal walk values */ + offset -= walk->initial_offset; + + /* Get to the correct fragment for offset */ + while (offset < walk->past) { + walk->past -= walk->frags[--walk->fragi].len; + if (offset >= walk->past) + break; + } + while (offset >= walk->past + walk->frags[walk->fragi].len) + walk->past += walk->frags[walk->fragi++].len; + + /* offset now relative to this current frag */ + offset -= walk->past; + return offset; +} + +/** + * iptfs_skb_can_add_frags() - check if ok to add frags from walk to skb + * @skb: skb to check for adding frags to + * @walk: the walk that will be used as source for frags. 
+ * @offset: offset from beginning of original skb to start from. + * @len: amount of data to add frag references to in @skb. + * + * Return: true if ok to add frags. + */ +static bool iptfs_skb_can_add_frags(const struct sk_buff *skb, + struct iptfs_skb_frag_walk *walk, + u32 offset, u32 len) +{ + struct skb_shared_info *shinfo = skb_shinfo(skb); + u32 fragi, nr_frags, fraglen; + + if (skb_has_frag_list(skb) || skb->pp_recycle != walk->pp_recycle) + return false; + + /* Make offset relative to current frag after setting that */ + offset = iptfs_skb_reset_frag_walk(walk, offset); + + /* Verify we have array space for the fragments we need to add */ + fragi = walk->fragi; + nr_frags = shinfo->nr_frags; + while (len && fragi < walk->nr_frags) { + skb_frag_t *frag = &walk->frags[fragi]; + + fraglen = frag->len; + if (offset) { + fraglen -= offset; + offset = 0; + } + if (++nr_frags > MAX_SKB_FRAGS) + return false; + if (len <= fraglen) + return true; + len -= fraglen; + fragi++; + } + /* We may not copy all @len but what we have will fit. */ + return true; +} + +/** + * iptfs_skb_add_frags() - add a range of fragment references into an skb + * @skb: skb to add references into + * @walk: the walk to add referenced fragments from. + * @offset: offset from beginning of original skb to start from. + * @len: amount of data to add frag references to in @skb. + * + * iptfs_skb_can_add_frags() should be called before this function to verify + * that the destination @skb is compatible with the walk and has space in the + * array for the to be added frag references. + * + * Return: The number of bytes not added to @skb b/c we reached the end of the + * walk before adding all of @len. 
+ */ +static int iptfs_skb_add_frags(struct sk_buff *skb, + struct iptfs_skb_frag_walk *walk, u32 offset, + u32 len) +{ + struct skb_shared_info *shinfo = skb_shinfo(skb); + u32 fraglen; + + if (!walk->nr_frags || offset >= walk->total + walk->initial_offset) + return len; + + /* make offset relative to current frag after setting that */ + offset = iptfs_skb_reset_frag_walk(walk, offset); + + while (len && walk->fragi < walk->nr_frags) { + skb_frag_t *frag = &walk->frags[walk->fragi]; + skb_frag_t *tofrag = &shinfo->frags[shinfo->nr_frags]; + + *tofrag = *frag; + if (offset) { + tofrag->offset += offset; + tofrag->len -= offset; + offset = 0; + } + __skb_frag_ref(tofrag); + shinfo->nr_frags++; + + /* see if we are done */ + fraglen = tofrag->len; + if (len < fraglen) { + tofrag->len = len; + skb->len += len; + skb->data_len += len; + return 0; + } + /* advance to next source fragment */ + len -= fraglen; /* careful, use dst bv_len */ + skb->len += fraglen; /* careful, " " " */ + skb->data_len += fraglen; /* careful, " " " */ + walk->past += frag->len; /* careful, use src bv_len */ + walk->fragi++; + } + return len; +} + +/* ================================== */ +/* IPTFS Trace Event Definitions */ +/* ================================== */ + +#define CREATE_TRACE_POINTS +#include "trace_iptfs.h" + +/* ================================== */ +/* IPTFS Receiving (egress) Functions */ +/* ================================== */ + +/** + * iptfs_pskb_add_frags() - Create and add frags into a new sk_buff. + * @tpl: template to create new skb from. + * @walk: The source for fragments to add. + * @off: The offset into @walk to add frags from, also used with @st and + * @copy_len. + * @len: The length of data to add covering frags from @walk into @skb. + * This must be <= @skblen. + * @st: The sequence state to copy from into the new head skb. + * @copy_len: Copy @copy_len bytes from @st at offset @off into the new skb + * linear space. 
+ * + * Create a new sk_buff `skb` using the template @tpl. Copy @copy_len bytes from + * @st into the new skb linear space, and then add shared fragments from the + * frag walk for the remaining @len of data (i.e., @len - @copy_len bytes). + * + * Return: The newly allocated sk_buff `skb` or NULL if an error occurs. + */ +static struct sk_buff * +iptfs_pskb_add_frags(struct sk_buff *tpl, struct iptfs_skb_frag_walk *walk, + u32 off, u32 len, struct skb_seq_state *st, u32 copy_len) +{ + struct sk_buff *skb; + + /* linear area holds only the @copy_len copied bytes; l3resv is false */ + skb = iptfs_alloc_skb(tpl, copy_len, false); + if (!skb) + return NULL; + + /* this should not normally be happening */ + if (!iptfs_skb_can_add_frags(skb, walk, off + copy_len, + len - copy_len)) { + kfree_skb(skb); + return NULL; + } + + /* copy the first @copy_len bytes at @off into the linear area */ + if (copy_len && + skb_copy_seq_read(st, off, skb_put(skb, copy_len), copy_len)) { + XFRM_INC_STATS(dev_net(st->root_skb->dev), + LINUX_MIB_XFRMINERROR); + kfree_skb(skb); + return NULL; + } + + /* reference (rather than copy) the remaining bytes from the walk; the + * count of bytes not added is intentionally not checked here + */ + iptfs_skb_add_frags(skb, walk, off + copy_len, len - copy_len); + return skb; +} + +/** + * iptfs_pskb_extract_seq() - Create and load data into a new sk_buff. + * @skblen: the total data size for `skb`. + * @st: The source for the data to copy into `skb`. + * @off: The offset into @st to copy data from. + * @len: The length of data to copy from @st into `skb`. This must be <= + * @skblen. + * + * Create a new sk_buff `skb` with @skblen of packet data space. Then, using the + * seq read functions, copy @len bytes from @st into `skb` starting from @off. + * + * It is an error for @len to be greater than the amount of data left in @st. + * + * Return: The newly allocated sk_buff `skb` or NULL if an error occurs. 
+ */ +static struct sk_buff * +iptfs_pskb_extract_seq(u32 skblen, struct skb_seq_state *st, u32 off, int len) +{ + struct sk_buff *skb = iptfs_alloc_skb(st->root_skb, skblen, false); + + if (!skb) + return NULL; + if (skb_copy_seq_read(st, off, skb_put(skb, len), len)) { + XFRM_INC_STATS(dev_net(st->root_skb->dev), LINUX_MIB_XFRMINERROR); + kfree_skb(skb); + return NULL; + } + return skb; +} + +/** + * iptfs_input_save_runt() - save data in xtfs runt space. + * @xtfs: xtfs state + * @seq: the current sequence + * @buf: packet data + * @len: length of packet data + * + * Save the small (`len`) start of a fragmented packet in `buf` in the xtfs data + * runt space. + */ +static void iptfs_input_save_runt(struct xfrm_iptfs_data *xtfs, u64 seq, + u8 *buf, int len) +{ + memcpy(xtfs->ra_runt, buf, len); + + xtfs->ra_runtlen = len; + xtfs->ra_wantseq = seq + 1; +} + +/** + * __iptfs_iphlen() - return the v4/v6 header length using packet data. + * @data: pointer at octet with version nibble + * + * The version data has been checked to be valid (i.e., either 4 or 6). + * + * Return: the IP header size based on the IP version. + */ +static u32 __iptfs_iphlen(u8 *data) +{ + struct iphdr *iph = (struct iphdr *)data; + + if (iph->version == 0x4) + return sizeof(*iph); + return sizeof(struct ipv6hdr); +} + +/** + * __iptfs_iplen() - return the v4/v6 length using packet data. + * @data: pointer to ip (v4/v6) packet header + * + * Grab the IPv4 or IPv6 length value in the start of the inner packet header + * pointed to by `data`. Assumes data len is enough for the length field only. + * + * The version data has been checked to be valid (i.e., either 4 or 6). + * + * Return: the length value. 
+ */ +static u32 __iptfs_iplen(u8 *data) +{ + struct iphdr *iph = (struct iphdr *)data; + + if (iph->version == 0x4) + return ntohs(iph->tot_len); + return ntohs(((struct ipv6hdr *)iph)->payload_len) + + sizeof(struct ipv6hdr); +} + +/** + * iptfs_complete_inner_skb() - finish preparing the inner packet for gro recv. + * @x: xfrm state + * @skb: the inner packet + * + * Finish the standard xfrm processing on the inner packet prior to sending back + * through gro_cells_receive. We do this separately b/c we are building a list + * of packets in the hopes that one day a list will be taken by + * xfrm_input. + */ +static void iptfs_complete_inner_skb(struct xfrm_state *x, struct sk_buff *skb) +{ + skb_reset_network_header(skb); + + /* The packet is going back through gro_cells_receive no need to + * set this. + */ + skb_reset_transport_header(skb); + + /* Packet already has checksum value set. */ + skb->ip_summed = CHECKSUM_NONE; + + /* Our skb will contain the header data copied when this outer packet + * which contained the start of this inner packet. This is true + * when we allocate a new skb as well as when we reuse the existing skb. 
+ */ + if (ip_hdr(skb)->version == 0x4) { + struct iphdr *iph = ip_hdr(skb); + + if (x->props.flags & XFRM_STATE_DECAP_DSCP) + ipv4_copy_dscp(XFRM_MODE_SKB_CB(skb)->tos, iph); + if (!(x->props.flags & XFRM_STATE_NOECN)) + if (INET_ECN_is_ce(XFRM_MODE_SKB_CB(skb)->tos)) + IP_ECN_set_ce(iph); + + skb->protocol = htons(ETH_P_IP); + } else { + struct ipv6hdr *iph = ipv6_hdr(skb); + + if (x->props.flags & XFRM_STATE_DECAP_DSCP) + ipv6_copy_dscp(XFRM_MODE_SKB_CB(skb)->tos, iph); + if (!(x->props.flags & XFRM_STATE_NOECN)) + if (INET_ECN_is_ce(XFRM_MODE_SKB_CB(skb)->tos)) + IP6_ECN_set_ce(skb, iph); + + skb->protocol = htons(ETH_P_IPV6); + } +} + +static void __iptfs_reassem_done(struct xfrm_iptfs_data *xtfs, bool free) +{ + assert_spin_locked(&xtfs->drop_lock); + + /* We don't care if it works locking takes care of things */ + hrtimer_try_to_cancel(&xtfs->drop_timer); + if (free) + kfree_skb(xtfs->ra_newskb); + xtfs->ra_newskb = NULL; +} + +/** + * iptfs_reassem_abort() - In-progress packet is aborted free the state. + * @xtfs: xtfs state + */ +static void iptfs_reassem_abort(struct xfrm_iptfs_data *xtfs) +{ + __iptfs_reassem_done(xtfs, true); +} + +/** + * iptfs_reassem_done() - In-progress packet is complete, clear the state. + * @xtfs: xtfs state + */ +static void iptfs_reassem_done(struct xfrm_iptfs_data *xtfs) +{ + __iptfs_reassem_done(xtfs, false); +} + +/** + * iptfs_reassem_cont() - Continue the reassembly of an inner packets. + * @xtfs: xtfs state + * @seq: sequence of current packet + * @st: seq read stat for current packet + * @skb: current packet + * @data: offset into sequential packet data + * @blkoff: packet blkoff value + * @list: list of skbs to enqueue completed packet on + * + * Process an IPTFS payload that has a non-zero `blkoff` or when we are + * expecting the continuation b/c we have a runt or in-progress packet. + * + * Return: the new data offset to continue processing from. 
+ */ +static u32 iptfs_reassem_cont(struct xfrm_iptfs_data *xtfs, u64 seq, + struct skb_seq_state *st, struct sk_buff *skb, + u32 data, u32 blkoff, struct list_head *list) +{ + struct iptfs_skb_frag_walk _fragwalk; + struct iptfs_skb_frag_walk *fragwalk = NULL; + struct sk_buff *newskb = xtfs->ra_newskb; + u32 remaining = skb->len - data; + u32 runtlen = xtfs->ra_runtlen; + u32 copylen, fraglen, ipremain, iphlen, iphremain, rrem; + + /* Handle packet fragment we aren't expecting */ + if (!runtlen && !xtfs->ra_newskb) + return data + min(blkoff, remaining); + + /* Important to remember that input to this function is an ordered + * packet stream (unless the user disabled the reorder window). Thus if + * we are waiting for, and expecting the next packet so we can continue + * assembly, a newer sequence number indicates older ones are not coming + * (or if they do should be ignored). Technically we can receive older + * ones when the reorder window is disabled; however, the user should + * have disabled fragmentation in this case, and regardless we don't + * deal with it. + * + * blkoff could be zero if the stream is messed up (or it's an all pad + * insertion) be careful to handle that case in each of the below + */ + + /* Too old case: This can happen when the reorder window is disabled so + * ordering isn't actually guaranteed. + */ + if (seq < xtfs->ra_wantseq) + return data + remaining; + + /* Too new case: We missed what we wanted cleanup. */ + if (seq > xtfs->ra_wantseq) { + XFRM_INC_STATS(xs_net(xtfs->x), LINUX_MIB_XFRMINIPTFSERROR); + goto abandon; + } + + if (blkoff == 0) { + if ((*skb->data & 0xF0) != 0) { + XFRM_INC_STATS(xs_net(xtfs->x), + LINUX_MIB_XFRMINIPTFSERROR); + goto abandon; + } + /* Handle all pad case, advance expected sequence number. 
+ * (RFC 9347 S2.2.3) + */ + xtfs->ra_wantseq++; + /* will end parsing */ + return data + remaining; + } + + if (runtlen) { + /* Regardless of what happens we're done with the runt */ + xtfs->ra_runtlen = 0; + + /* The start of this inner packet was at the very end of the last + * iptfs payload which didn't include enough for the ip header + * length field. We must have *at least* that now. + */ + rrem = sizeof(xtfs->ra_runt) - runtlen; + if (remaining < rrem || blkoff < rrem) { + XFRM_INC_STATS(xs_net(xtfs->x), + LINUX_MIB_XFRMINIPTFSERROR); + goto abandon; + } + + /* fill in the runt data */ + if (skb_copy_seq_read(st, data, &xtfs->ra_runt[runtlen], + rrem)) { + XFRM_INC_STATS(xs_net(xtfs->x), + LINUX_MIB_XFRMINBUFFERERROR); + goto abandon; + } + + /* We have enough data to get the ip length value now, + * allocate an in progress skb + */ + ipremain = __iptfs_iplen(xtfs->ra_runt); + if (ipremain < sizeof(xtfs->ra_runt)) { + /* length has to be at least runtsize large */ + XFRM_INC_STATS(xs_net(xtfs->x), + LINUX_MIB_XFRMINIPTFSERROR); + goto abandon; + } + + /* For the runt case we don't attempt sharing currently. NOTE: + * Currently, this IPTFS implementation will not create runts. + */ + + newskb = iptfs_alloc_skb(skb, ipremain, false); + if (!newskb) { + XFRM_INC_STATS(xs_net(xtfs->x), LINUX_MIB_XFRMINERROR); + goto abandon; + } + xtfs->ra_newskb = newskb; + + /* Copy the runt data into the buffer, but leave data + * pointers the same as normal non-runt case. The extra `rrem` + * recopied bytes are basically cacheline free. Allows using + * same logic below to complete. 
+ */ + memcpy(skb_put(newskb, runtlen), xtfs->ra_runt, + sizeof(xtfs->ra_runt)); + } + + /* Continue reassembling the packet */ + ipremain = __iptfs_iplen(newskb->data); + iphlen = __iptfs_iphlen(newskb->data); + + ipremain -= newskb->len; + if (blkoff < ipremain) { + /* Corrupt data, we don't have enough to complete the packet */ + XFRM_INC_STATS(xs_net(xtfs->x), LINUX_MIB_XFRMINIPTFSERROR); + goto abandon; + } + + /* We want the IP header in linear space */ + if (newskb->len < iphlen) { + iphremain = iphlen - newskb->len; + if (blkoff < iphremain) { + XFRM_INC_STATS(xs_net(xtfs->x), + LINUX_MIB_XFRMINIPTFSERROR); + goto abandon; + } + fraglen = min(blkoff, remaining); + copylen = min(fraglen, iphremain); + if (skb_copy_seq_read(st, data, skb_put(newskb, copylen), + copylen)) { + XFRM_INC_STATS(xs_net(xtfs->x), + LINUX_MIB_XFRMINBUFFERERROR); + goto abandon; + } + /* this is a silly condition that might occur anyway */ + if (copylen < iphremain) { + xtfs->ra_wantseq++; + return data + fraglen; + } + /* update data and things derived from it */ + data += copylen; + blkoff -= copylen; + remaining -= copylen; + ipremain -= copylen; + } + + fraglen = min(blkoff, remaining); + copylen = min(fraglen, ipremain); + + /* If we may have the opportunity to share prepare a fragwalk. */ + if (!skb_has_frag_list(skb) && !skb_has_frag_list(newskb) && + (skb->head_frag || skb->len == skb->data_len) && + skb->pp_recycle == newskb->pp_recycle) { + fragwalk = &_fragwalk; + iptfs_skb_prepare_frag_walk(skb, data, fragwalk); + } + + /* Try share then copy. 
*/ + if (fragwalk && + iptfs_skb_can_add_frags(newskb, fragwalk, data, copylen)) { + iptfs_skb_add_frags(newskb, fragwalk, data, copylen); + } else { + /* copy fragment data into newskb */ + if (skb_copy_seq_read(st, data, skb_put(newskb, copylen), + copylen)) { + XFRM_INC_STATS(xs_net(xtfs->x), + LINUX_MIB_XFRMINBUFFERERROR); + goto abandon; + } + } + + if (copylen < ipremain) { + xtfs->ra_wantseq++; + } else { + /* We are done with packet reassembly! */ + iptfs_reassem_done(xtfs); + iptfs_complete_inner_skb(xtfs->x, newskb); + list_add_tail(&newskb->list, list); + } + + /* will continue on to new data block or end */ + return data + fraglen; + +abandon: + if (xtfs->ra_newskb) { + iptfs_reassem_abort(xtfs); + } else { + xtfs->ra_runtlen = 0; + xtfs->ra_wantseq = 0; + } + /* skip past fragment, maybe to end */ + return data + min(blkoff, remaining); +} + +static bool __input_process_payload(struct xfrm_state *x, u32 data, + struct skb_seq_state *skbseq, + struct list_head *sublist) +{ + u8 hbytes[sizeof(struct ipv6hdr)]; + struct iptfs_skb_frag_walk _fragwalk; + struct iptfs_skb_frag_walk *fragwalk = NULL; + struct sk_buff *defer, *first_skb, *next, *skb; + const unsigned char *old_mac; + struct xfrm_iptfs_data *xtfs; + struct iphdr *iph; + struct net *net; + u32 first_iplen, iphlen, iplen, remaining, tail; + u32 capturelen; + u64 seq; + + xtfs = x->mode_data; + net = xs_net(x); + skb = skbseq->root_skb; + first_skb = NULL; + defer = NULL; + + seq = __esp_seq(skb); + + /* Save the old mac header if set */ + old_mac = skb_mac_header_was_set(skb) ? skb_mac_header(skb) : NULL; + + /* New packets */ + + tail = skb->len; + while (data < tail) { + __be16 protocol = 0; + + /* Gather information on the next data block. + * `data` points to the start of the data block. 
+ */ + remaining = tail - data; + + /* try and copy enough bytes to read length from ipv4/ipv6 */ + iphlen = min_t(u32, remaining, 6); + if (skb_copy_seq_read(skbseq, data, hbytes, iphlen)) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR); + goto done; + } + + iph = (struct iphdr *)hbytes; + if (iph->version == 0x4) { + /* must have at least tot_len field present */ + if (remaining < 4) { + /* save the bytes we have, advance data and exit */ + iptfs_input_save_runt(xtfs, seq, hbytes, + remaining); + data += remaining; + break; + } + + iplen = be16_to_cpu(iph->tot_len); + iphlen = iph->ihl << 2; + + /* Validate the inner header before trusting its lengths: + * ihl must cover at least a basic IPv4 header and tot_len + * must cover the header. With tot_len == 0 capturelen + * would be 0, `data` would never advance and this loop + * would spin forever allocating skbs. + */ + if (iphlen < sizeof(*iph) || iplen < iphlen) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR); + /* Stop parsing this payload; leaving via the normal + * loop exit still delivers packets already queued + * on sublist instead of leaking them. + */ + data = tail; + break; + } + + protocol = cpu_to_be16(ETH_P_IP); + XFRM_MODE_SKB_CB(skbseq->root_skb)->tos = iph->tos; + } else if (iph->version == 0x6) { + /* must have at least payload_len field present */ + if (remaining < 6) { + /* save the bytes we have, advance data and exit */ + iptfs_input_save_runt(xtfs, seq, hbytes, + remaining); + data += remaining; + break; + } + + /* payload_len excludes the fixed header, so iplen >= iphlen here */ + iplen = be16_to_cpu(((struct ipv6hdr *)hbytes)->payload_len); + iplen += sizeof(struct ipv6hdr); + iphlen = sizeof(struct ipv6hdr); + protocol = cpu_to_be16(ETH_P_IPV6); + XFRM_MODE_SKB_CB(skbseq->root_skb)->tos = + ipv6_get_dsfield((struct ipv6hdr *)iph); + } else if (iph->version == 0x0) { + /* pad */ + data = tail; + break; + } else { + XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR); + goto done; + } + + if (unlikely(skbseq->stepped_offset)) { + /* We need to reset our seq read, it can't backup at + * this point. + */ + struct sk_buff *save = skbseq->root_skb; + + skb_abort_seq_read(skbseq); + skb_prepare_seq_read(save, data, tail, skbseq); + } + + if (first_skb) { + skb = NULL; + } else { + first_skb = skb; + first_iplen = iplen; + fragwalk = NULL; + + /* We are going to skip over `data` bytes to reach the + * start of the IP header of `iphlen` len for `iplen` + * inner packet. 
+ */ + + if (skb_has_frag_list(skb)) { + defer = skb; + skb = NULL; + } else if (data + iphlen <= skb_headlen(skb) && + /* make sure our header is 32-bit aligned? */ + /* ((uintptr_t)(skb->data + data) & 0x3) == 0 && */ + skb_tailroom(skb) + tail - data >= iplen) { + /* Reuse the received skb. + * + * We have enough headlen to pull past any + * initial fragment data, leaving at least the + * IP header in the linear buffer space. + * + * For linear buffer space we only require that + * linear buffer space is large enough to + * eventually hold the entire reassembled + * packet (by including tailroom in the check). + * + * For non-linear tailroom is 0 and so we only + * re-use if the entire packet is present + * already. + * + * NOTE: there are many more options for + * sharing, KISS for now. Also, this can produce + * skb's with the IP header unaligned to 32 + * bits. If that ends up being a problem then a + * check should be added to the conditional + * above that the header lies on a 32-bit + * boundary as well. + */ + skb_pull(skb, data); + + /* our range just changed */ + data = 0; + tail = skb->len; + remaining = skb->len; + + skb->protocol = protocol; + skb_mac_header_rebuild(skb); + if (skb->mac_len) + eth_hdr(skb)->h_proto = skb->protocol; + + /* all pointers could be changed now reset walk */ + skb_abort_seq_read(skbseq); + skb_prepare_seq_read(skb, data, tail, skbseq); + } else if (skb->head_frag && + /* We have the IP header right now */ + remaining >= iphlen) { + fragwalk = &_fragwalk; + iptfs_skb_prepare_frag_walk(skb, data, fragwalk); + defer = skb; + skb = NULL; + } else { + /* We couldn't reuse the input skb so allocate a + * new one. + */ + defer = skb; + skb = NULL; + } + + /* Don't trim `first_skb` until the end as we are + * walking that data now. + */ + } + + capturelen = min(iplen, remaining); + if (!skb) { + if (!fragwalk || + /* Large enough to be worth sharing */ + iplen < IPTFS_PKT_SHARE_MIN || + /* Have IP header + some data to share. 
*/ + capturelen <= iphlen || + /* Try creating skb and adding frags */ + !(skb = iptfs_pskb_add_frags(first_skb, fragwalk, + data, capturelen, + skbseq, iphlen))) { + skb = iptfs_pskb_extract_seq(iplen, skbseq, data, capturelen); + } + if (!skb) { + /* skip to next packet or done */ + data += capturelen; + continue; + } + + skb->protocol = protocol; + if (old_mac) { + /* rebuild the mac header */ + skb_set_mac_header(skb, -first_skb->mac_len); + memcpy(skb_mac_header(skb), old_mac, first_skb->mac_len); + eth_hdr(skb)->h_proto = skb->protocol; + } + } + + data += capturelen; + + if (skb->len < iplen) { + /* Start reassembly */ + spin_lock(&xtfs->drop_lock); + + xtfs->ra_newskb = skb; + xtfs->ra_wantseq = seq + 1; + if (!hrtimer_is_queued(&xtfs->drop_timer)) { + /* softirq blocked lest the timer fire and interrupt us */ + hrtimer_start(&xtfs->drop_timer, + xtfs->drop_time_ns, + IPTFS_HRTIMER_MODE); + } + + spin_unlock(&xtfs->drop_lock); + + break; + } + + iptfs_complete_inner_skb(x, skb); + list_add_tail(&skb->list, sublist); + } + + if (data != tail) + /* this should not happen from the above code */ + XFRM_INC_STATS(net, LINUX_MIB_XFRMINIPTFSERROR); + + if (first_skb && first_iplen && !defer && first_skb != xtfs->ra_newskb) { + /* first_skb is queued b/c !defer and not partial */ + if (pskb_trim(first_skb, first_iplen)) { + /* error trimming */ + list_del(&first_skb->list); + defer = first_skb; + } + first_skb->ip_summed = CHECKSUM_NONE; + } + + /* Send the packets! */ + list_for_each_entry_safe(skb, next, sublist, list) { + skb_list_del_init(skb); + if (xfrm_input(skb, 0, 0, -2)) + kfree_skb(skb); + } +done: + skb = skbseq->root_skb; + skb_abort_seq_read(skbseq); + + if (defer) { + consume_skb(defer); + } else if (!first_skb) { + /* skb is the original passed in skb, but we didn't get far + * enough to process it as the first_skb, if we had it would + * either be save in ra_newskb, trimmed and sent on as an skb or + * placed in defer to be freed. 
+ */ + kfree_skb(skb); + } + return true; +} + +/** + * iptfs_input_ordered() - handle next in order IPTFS payload. + * @x: xfrm state + * @skb: current packet + * + * Process the IPTFS payload in `skb` and consume it afterwards. + */ +static void iptfs_input_ordered(struct xfrm_state *x, struct sk_buff *skb) +{ + struct ip_iptfs_cc_hdr iptcch; + struct skb_seq_state skbseq; + struct list_head sublist; /* rename this it's just a list */ + struct xfrm_iptfs_data *xtfs; + struct ip_iptfs_hdr *ipth; + struct net *net; + u32 blkoff, data, remaining; + bool consumed = false; + u64 seq; + + xtfs = x->mode_data; + net = xs_net(x); + + seq = __esp_seq(skb); + + /* Large enough to hold both types of header */ + ipth = (struct ip_iptfs_hdr *)&iptcch; + + skb_prepare_seq_read(skb, 0, skb->len, &skbseq); + + /* Get the IPTFS header and validate it */ + + if (skb_copy_seq_read(&skbseq, 0, ipth, sizeof(*ipth))) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR); + goto done; + } + data = sizeof(*ipth); + + trace_iptfs_egress_recv(skb, xtfs, be16_to_cpu(ipth->block_offset)); + + /* Set data past the basic header */ + if (ipth->subtype == IPTFS_SUBTYPE_CC) { + /* Copy the rest of the CC header */ + remaining = sizeof(iptcch) - sizeof(*ipth); + if (skb_copy_seq_read(&skbseq, data, ipth + 1, remaining)) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR); + goto done; + } + data += remaining; + } else if (ipth->subtype != IPTFS_SUBTYPE_BASIC) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR); + goto done; + } + + if (ipth->flags != 0) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR); + goto done; + } + + INIT_LIST_HEAD(&sublist); + + /* Handle fragment at start of payload, and/or waiting reassembly. 
*/ + + blkoff = ntohs(ipth->block_offset); + /* check before locking i.e., maybe */ + if (blkoff || xtfs->ra_runtlen || xtfs->ra_newskb) { + spin_lock(&xtfs->drop_lock); + + /* check again after lock */ + if (blkoff || xtfs->ra_runtlen || xtfs->ra_newskb) { + data = iptfs_reassem_cont(xtfs, seq, &skbseq, skb, data, + blkoff, &sublist); + } + + spin_unlock(&xtfs->drop_lock); + } + + /* New packets */ + consumed = __input_process_payload(x, data, &skbseq, &sublist); +done: + if (!consumed) { + skb = skbseq.root_skb; + skb_abort_seq_read(&skbseq); + kfree_skb(skb); + } +} + +/* ------------------------------- */ +/* Input (Egress) Re-ordering Code */ +/* ------------------------------- */ + +/* Drop the first @shift entries of the w_saved array: slide the remaining + * entries down to index 0, zero the vacated tail and reduce w_savedlen. + * @shift is clamped to w_savedlen. + */ +static void __vec_shift(struct xfrm_iptfs_data *xtfs, u32 shift) +{ + u32 savedlen = xtfs->w_savedlen; + + if (shift > savedlen) + shift = savedlen; + /* Source and destination are the same array and overlap whenever + * shift < savedlen - shift, so memmove() (not memcpy()) is required. + */ + if (shift != savedlen) + memmove(xtfs->w_saved, xtfs->w_saved + shift, + (savedlen - shift) * sizeof(*xtfs->w_saved)); + memset(xtfs->w_saved + savedlen - shift, 0, + shift * sizeof(*xtfs->w_saved)); + xtfs->w_savedlen -= shift; +} + +static void __reorder_past(struct xfrm_iptfs_data *xtfs, struct sk_buff *inskb, + struct list_head *freelist) +{ + list_add_tail(&inskb->list, freelist); +} + +static u32 __reorder_drop(struct xfrm_iptfs_data *xtfs, struct list_head *list) + +{ + struct skb_wseq *s, *se; + const u32 savedlen = xtfs->w_savedlen; + time64_t now = ktime_get_raw_fast_ns(); + u32 count = 0; + u32 scount = 0; + + if (xtfs->w_saved[0].drop_time > now) + goto set_timer; + + ++xtfs->w_wantseq; + + /* Keep flushing packets until we reach a drop time greater than now. */ + s = xtfs->w_saved; + se = s + savedlen; + do { + /* Walking past empty slots until we reach a packet */ + for (; s < se && !s->skb; s++) { + if (s->drop_time > now) + goto outerdone; + } + /* Sending packets until we hit another empty slot. 
*/ + for (; s < se && s->skb; scount++, s++) + list_add_tail(&s->skb->list, list); + } while (s < se); +outerdone: + + count = s - xtfs->w_saved; + if (count) { + xtfs->w_wantseq += count; + + /* Shift handled slots plus final empty slot into slot 0. */ + __vec_shift(xtfs, count); + } + + if (xtfs->w_savedlen) { +set_timer: + /* Drifting is OK */ + hrtimer_start(&xtfs->drop_timer, + xtfs->w_saved[0].drop_time - now, + IPTFS_HRTIMER_MODE); + } + return scount; +} + +static void __reorder_this(struct xfrm_iptfs_data *xtfs, struct sk_buff *inskb, + struct list_head *list) +{ + struct skb_wseq *s, *se; + const u32 savedlen = xtfs->w_savedlen; + u32 count = 0; + + /* Got what we wanted. */ + list_add_tail(&inskb->list, list); + ++xtfs->w_wantseq; + if (!savedlen) + return; + + /* Flush remaining consecutive packets. */ + + /* Keep sending until we hit another missed pkt. */ + for (s = xtfs->w_saved, se = s + savedlen; s < se && s->skb; s++) + list_add_tail(&s->skb->list, list); + count = s - xtfs->w_saved; + if (count) + xtfs->w_wantseq += count; + + /* Shift handled slots plus final empty slot into slot 0. */ + __vec_shift(xtfs, count + 1); +} + +/* Set the slot's drop time and all the empty slots below it until reaching a + * filled slot which will already be set. + */ +static void iptfs_set_window_drop_times(struct xfrm_iptfs_data *xtfs, int index) +{ + const u32 savedlen = xtfs->w_savedlen; + struct skb_wseq *s = xtfs->w_saved; + time64_t drop_time; + + assert_spin_locked(&xtfs->drop_lock); + + if (savedlen > index + 1) { + /* we are below another, our drop time and the timer are already set */ + return; + } + /* we are the most future so get a new drop time. 
*/ + drop_time = ktime_get_raw_fast_ns(); + drop_time += xtfs->drop_time_ns; + + /* Walk back through the array setting drop times as we go */ + s[index].drop_time = drop_time; + while (index-- > 0 && !s[index].skb) + s[index].drop_time = drop_time; + + /* If we walked all the way back, schedule the drop timer if needed */ + if (index == -1 && !hrtimer_is_queued(&xtfs->drop_timer)) + hrtimer_start(&xtfs->drop_timer, xtfs->drop_time_ns, + IPTFS_HRTIMER_MODE); +} + +/* Save @inskb, whose sequence is ahead of w_wantseq, in its slot of the + * w_saved array (index == distance - 1). If that slot is already occupied the + * packet is a duplicate and is queued on @freelist instead. + */ +static void __reorder_future_fits(struct xfrm_iptfs_data *xtfs, + struct sk_buff *inskb, + struct list_head *freelist) +{ + const u64 inseq = __esp_seq(inskb); + const u64 wantseq = xtfs->w_wantseq; + const u64 distance = inseq - wantseq; + const u32 savedlen = xtfs->w_savedlen; + const u32 index = distance - 1; + + /* Handle future sequence number received which fits in the window. + * + * We know we don't have the seq we want so we won't be able to flush + * anything. + */ + + /* slot count is 4, saved size is 3 savedlen is 2 + * + * "window boundary" is based on the fixed window size + * distance is also slot number + * index is an array index (i.e., - 1 of slot) + * : : - implicit NULL after array len + * + * +--------- used length (savedlen == 2) + * | +----- array size (nslots - 1 == 3) + * | | + window boundary (nslots == 4) + * V V | V + * | + * 0 1 2 3 | slot number + * --- 0 1 2 | array index + * [-] [b] : :| array + * + * "2" "3" "4" *5*| seq numbers + * + * We receive seq number 5 + * distance == 3 [inseq(5) - w_wantseq(2)] + * index == 2 [distance(3) - 1] + */ + + if (xtfs->w_saved[index].skb) { + /* a dup of a future */ + list_add_tail(&inskb->list, freelist); + return; + } + + xtfs->w_saved[index].skb = inskb; + xtfs->w_savedlen = max(savedlen, index + 1); + iptfs_set_window_drop_times(xtfs, index); +} + +static void __reorder_future_shifts(struct xfrm_iptfs_data *xtfs, + struct sk_buff *inskb, + struct list_head *list) +{ + const u32 nslots = xtfs->cfg.reorder_win_size + 1; + const u64 
inseq = __esp_seq(inskb); + u32 savedlen = xtfs->w_savedlen; + u64 wantseq = xtfs->w_wantseq; + struct skb_wseq *wnext; + struct sk_buff *slot0; + u32 beyond, shifting, slot; + u64 distance; + + /* Handle future sequence number received. + * + * IMPORTANT: we are at least advancing w_wantseq (i.e., wantseq) by 1 + * b/c we are beyond the window boundary. + * + * We know we don't have the wantseq so that counts as a drop. + */ + + /* example: slot count is 4, array size is 3 savedlen is 2, slot 0 is + * the missing sequence number. + * + * the final slot at savedlen (index savedlen - 1) is always occupied. + * + * beyond is "beyond array size" not savedlen. + * + * +--------- array length (savedlen == 2) + * | +----- array size (nslots - 1 == 3) + * | | +- window boundary (nslots == 4) + * V V | + * | + * 0 1 2 3 | slot number + * --- 0 1 2 | array index + * [b] [c] : :| array + * | + * "2" "3" "4" "5"|*6* seq numbers + * + * We receive seq number 6 + * distance == 4 [inseq(6) - w_wantseq(2)] + * newslot == distance + * index == 3 [distance(4) - 1] + * beyond == 1 [newslot(4) - lastslot((nslots(4) - 1))] + * shifting == 1 [min(savedlen(2), beyond(1)] + * slot0_skb == [b], and should match w_wantseq + * + * +--- window boundary (nslots == 4) + * 0 1 2 3 | 4 slot number + * --- 0 1 2 | 3 array index + * [b] : : : :| array + * "2" "3" "4" "5" *6* seq numbers + * + * We receive seq number 6 + * distance == 4 [inseq(6) - w_wantseq(2)] + * newslot == distance + * index == 3 [distance(4) - 1] + * beyond == 1 [newslot(4) - lastslot((nslots(4) - 1))] + * shifting == 1 [min(savedlen(1), beyond(1)] + * slot0_skb == [b] and should match w_wantseq + * + * +-- window boundary (nslots == 4) + * 0 1 2 3 | 4 5 6 slot number + * --- 0 1 2 | 3 4 5 array index + * [-] [c] : :| array + * "2" "3" "4" "5" "6" "7" *8* seq numbers + * + * savedlen = 2, beyond = 3 + * iter 1: slot0 == NULL, missed++, lastdrop = 2 (2+1-1), slot0 = [-] + * iter 2: slot0 == NULL, missed++, lastdrop = 3 (2+2-1), 
slot0 = [c] + * 2 < 3, extra = 1 (3-2), missed += extra, lastdrop = 4 (2+2+1-1) + * + * We receive seq number 8 + * distance == 6 [inseq(8) - w_wantseq(2)] + * newslot == distance + * index == 5 [distance(6) - 1] + * beyond == 3 [newslot(6) - lastslot((nslots(4) - 1))] + * shifting == 2 [min(savedlen(2), beyond(3)] + * + * slot0_skb == NULL changed from [b] when "savedlen < beyond" is true. + */ + + /* Now send any packets that are being shifted out of saved, and account + * for missing packets that are exiting the window as we shift it. + */ + + distance = inseq - wantseq; + beyond = distance - (nslots - 1); + + /* If savedlen > beyond we are shifting some, else all. */ + shifting = min(savedlen, beyond); + + /* slot0 is the buf that just shifted out and into slot0 */ + slot0 = NULL; + wnext = xtfs->w_saved; + for (slot = 1; slot <= shifting; slot++, wnext++) { + /* handle what was in slot0 before we occupy it */ + if (slot0) + list_add_tail(&slot0->list, list); + slot0 = wnext->skb; + wnext->skb = NULL; + } + + /* slot0 is now either NULL (in which case it's what we now are waiting + * for, or a buf in which case we need to handle it like we received it; + * however, we may be advancing past that buffer as well.. + */ + + /* Handle case where we need to shift more than we had saved, slot0 will + * be NULL iff savedlen is 0, otherwise slot0 will always be + * non-NULL b/c we shifted the final element, which is always set if + * there is any saved, into slot0. + */ + if (savedlen < beyond) { + if (savedlen != 0) + list_add_tail(&slot0->list, list); + slot0 = NULL; + /* slot0 has had an empty slot pushed into it */ + } + + /* Remove the entries */ + __vec_shift(xtfs, beyond); + + /* Advance want seq */ + xtfs->w_wantseq += beyond; + + /* Process drops here when implementing congestion control */ + + /* We've shifted. plug the packet in at the end. 
*/ + xtfs->w_savedlen = nslots - 1; + xtfs->w_saved[xtfs->w_savedlen - 1].skb = inskb; + iptfs_set_window_drop_times(xtfs, xtfs->w_savedlen - 1); + + /* if we don't have a slot0 then we must wait for it */ + if (!slot0) + return; + + /* If slot0, seq must match new want seq */ + + /* slot0 is valid, treat like we received expected. */ + __reorder_this(xtfs, slot0, list); +} + +/* Receive a new packet into the reorder window. Return a list of ordered + * packets from the window. + */ +static void iptfs_input_reorder(struct xfrm_iptfs_data *xtfs, + struct sk_buff *inskb, struct list_head *list, + struct list_head *freelist) +{ + const u32 nslots = xtfs->cfg.reorder_win_size + 1; + u64 inseq = __esp_seq(inskb); + u64 wantseq; + + assert_spin_locked(&xtfs->drop_lock); + + if (unlikely(!xtfs->w_seq_set)) { + xtfs->w_seq_set = true; + xtfs->w_wantseq = inseq; + } + wantseq = xtfs->w_wantseq; + + if (likely(inseq == wantseq)) + __reorder_this(xtfs, inskb, list); + else if (inseq < wantseq) + __reorder_past(xtfs, inskb, freelist); + else if ((inseq - wantseq) < nslots) + __reorder_future_fits(xtfs, inskb, freelist); + else + __reorder_future_shifts(xtfs, inskb, list); +} + +/** + * iptfs_drop_timer() - Handle drop timer expiry. + * @me: the timer + * + * This is similar to our input function. + * + * The drop timer is set when we start an in progress reassembly, and also when + * we save a future packet in the window saved array. + * + * NOTE packets in the save window are always newer WRT drop times as + * they get further in the future. i.e. for: + * + * if slots (S0, S1, ... Sn) and `Dn` is the drop time for slot `Sn`, + * then D(n-1) <= D(n). + * + * So, regardless of why the timer is firing we can always discard any inprogress + * fragment; either it's the reassembly timer, or slot 0 is going to be + * dropped as S0 must have the most recent drop time, and slot 0 holds the + * continuation fragment of the in progress packet. + * + * Returns HRTIMER_NORESTART. 
+ */ +static enum hrtimer_restart iptfs_drop_timer(struct hrtimer *me) +{ + struct sk_buff *skb, *next; + struct list_head list; + struct xfrm_iptfs_data *xtfs; + struct xfrm_state *x; + u32 count; + + xtfs = container_of(me, typeof(*xtfs), drop_timer); + x = xtfs->x; + + INIT_LIST_HEAD(&list); + + spin_lock(&xtfs->drop_lock); + + /* Drop any in progress packet */ + skb = xtfs->ra_newskb; + xtfs->ra_newskb = NULL; + + /* Now drop as many packets as we should from the reordering window + * saved array + */ + count = xtfs->w_savedlen ? __reorder_drop(xtfs, &list) : 0; + + spin_unlock(&xtfs->drop_lock); + + if (skb) + kfree_skb_reason(skb, SKB_DROP_REASON_FRAG_REASM_TIMEOUT); + + if (count) { + list_for_each_entry_safe(skb, next, &list, list) { + skb_list_del_init(skb); + iptfs_input_ordered(x, skb); + } + } + + return HRTIMER_NORESTART; +} + +/** + * iptfs_input() - handle receipt of iptfs payload + * @x: xfrm state + * @skb: the packet + * + * We have an IPTFS payload order it if needed, then process newly in order + * packets. + * + * Return: -EINPROGRESS to inform xfrm_input to stop processing the skb. + */ +static int iptfs_input(struct xfrm_state *x, struct sk_buff *skb) +{ + struct list_head freelist, list; + struct xfrm_iptfs_data *xtfs = x->mode_data; + struct sk_buff *next; + + /* Fast path for no reorder window. */ + if (xtfs->cfg.reorder_win_size == 0) { + iptfs_input_ordered(x, skb); + goto done; + } + + /* Fetch list of in-order packets from the reordering window as well as + * a list of buffers we need to now free. 
+ */ + INIT_LIST_HEAD(&list); + INIT_LIST_HEAD(&freelist); + + spin_lock(&xtfs->drop_lock); + iptfs_input_reorder(xtfs, skb, &list, &freelist); + spin_unlock(&xtfs->drop_lock); + + list_for_each_entry_safe(skb, next, &list, list) { + skb_list_del_init(skb); + iptfs_input_ordered(x, skb); + } + + list_for_each_entry_safe(skb, next, &freelist, list) { + skb_list_del_init(skb); + kfree_skb(skb); + } +done: + /* We always have dealt with the input SKB, either we are re-using it, + * or we have freed it. Return EINPROGRESS so that xfrm_input stops + * processing it. + */ + return -EINPROGRESS; +} + +/* ================================= */ +/* IPTFS Sending (ingress) Functions */ +/* ================================= */ + +/* ------------------------- */ +/* Enqueue to send functions */ +/* ------------------------- */ + +/** + * iptfs_enqueue() - enqueue packet if ok to send. + * @xtfs: xtfs state + * @skb: the packet + * + * Return: true if packet enqueued. + */ +static bool iptfs_enqueue(struct xfrm_iptfs_data *xtfs, struct sk_buff *skb) +{ + u64 newsz = xtfs->queue_size + skb->len; + struct iphdr *iph; + + assert_spin_locked(&xtfs->x->lock); + + if (newsz > xtfs->cfg.max_queue_size) + return false; + + /* Set ECN CE if we are above our ECN queue threshold */ + if (newsz > xtfs->ecn_queue_size) { + iph = ip_hdr(skb); + if (iph->version == 4) + IP_ECN_set_ce(iph); + else if (iph->version == 6) + IP6_ECN_set_ce(skb, ipv6_hdr(skb)); + } + + __skb_queue_tail(&xtfs->queue, skb); + xtfs->queue_size += skb->len; + return true; +} + +static int iptfs_get_cur_pmtu(struct xfrm_state *x, struct xfrm_iptfs_data *xtfs, + struct sk_buff *skb) +{ + struct xfrm_dst *xdst = (struct xfrm_dst *)skb_dst(skb); + u32 payload_mtu = xtfs->payload_mtu; + u32 pmtu = __iptfs_get_inner_mtu(x, xdst->child_mtu_cached); + + if (payload_mtu && payload_mtu < pmtu) + pmtu = payload_mtu; + + return pmtu; +} + +static int iptfs_is_too_big(struct sock *sk, struct sk_buff *skb, u32 pmtu) +{ + if (skb->len 
<= pmtu) + return 0; + + /* We only send ICMP too big if the user has configured us as + * dont-fragment. + */ + if (skb->dev) + XFRM_INC_STATS(dev_net(skb->dev), LINUX_MIB_XFRMOUTERROR); + + if (sk) + xfrm_local_error(skb, pmtu); + else if (ip_hdr(skb)->version == 4) + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(pmtu)); + else + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, pmtu); + + return 1; +} + +/* IPv4/IPv6 packet ingress to IPTFS tunnel, arrange to send in IPTFS payload + * (i.e., aggregating or fragmenting as appropriate). + * This is set in dst->output for an SA. + */ +static int iptfs_output_collect(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct xfrm_state *x = dst->xfrm; + struct xfrm_iptfs_data *xtfs = x->mode_data; + struct sk_buff *segs, *nskb; + u32 pmtu = 0; + bool ok = true; + bool was_gso; + + /* We have hooked into dst_entry->output which means we have skipped the + * protocol specific netfilter (see xfrm4_output, xfrm6_output). + * when our timer runs we will end up calling xfrm_output directly on + * the encapsulated traffic. + * + * For both cases this is the NF_INET_POST_ROUTING hook which allows + * changing the skb->dst entry which then may not be xfrm based anymore + * in which case a REROUTED flag is set. and dst_output is called. + * + * For IPv6 we are also skipping fragmentation handling for local + * sockets, which may or may not be good depending on our tunnel DF + * setting. Normally with fragmentation supported we want to skip this + * fragmentation. + */ + + if (xtfs->cfg.dont_frag) + pmtu = iptfs_get_cur_pmtu(x, xtfs, skb); + + /* Break apart GSO skbs. If the queue is nearing full then we want the + * accounting and queuing to be based on the individual packets not on the + * aggregate GSO buffer. 
+ */ + was_gso = skb_is_gso(skb); + if (!was_gso) { + segs = skb; + } else { + segs = skb_gso_segment(skb, 0); + if (IS_ERR_OR_NULL(segs)) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTERROR); + kfree_skb(skb); + if (IS_ERR(segs)) + return PTR_ERR(segs); + return -EINVAL; + } + consume_skb(skb); + skb = NULL; + } + + /* We can be running on multiple cores and from the network softirq or + * from user context depending on where the packet is coming from. + */ + spin_lock_bh(&x->lock); + + skb_list_walk_safe(segs, skb, nskb) { + skb_mark_not_on_list(skb); + + /* Once we drop due to no queue space we continue to drop the + * rest of the packets from that GRO. + */ + if (!ok) { +nospace: + trace_iptfs_no_queue_space(skb, xtfs, pmtu, was_gso); + XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOQSPACE); + kfree_skb_reason(skb, SKB_DROP_REASON_FULL_RING); + continue; + } + + /* If the user indicated no iptfs fragmenting check before + * enqueue. + */ + if (xtfs->cfg.dont_frag && iptfs_is_too_big(sk, skb, pmtu)) { + trace_iptfs_too_big(skb, xtfs, pmtu, was_gso); + kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG); + continue; + } + + /* Enqueue to send in tunnel */ + ok = iptfs_enqueue(xtfs, skb); + if (!ok) + goto nospace; + + trace_iptfs_enqueue(skb, xtfs, pmtu, was_gso); + } + + /* Start a delay timer if we don't have one yet */ + if (!hrtimer_is_queued(&xtfs->iptfs_timer)) { + hrtimer_start(&xtfs->iptfs_timer, xtfs->init_delay_ns, IPTFS_HRTIMER_MODE); + xtfs->iptfs_settime = ktime_get_raw_fast_ns(); + trace_iptfs_timer_start(xtfs, xtfs->init_delay_ns); + } + + spin_unlock_bh(&x->lock); + return 0; +} + +/* -------------------------- */ +/* Dequeue and send functions */ +/* -------------------------- */ + +static void iptfs_output_prepare_skb(struct sk_buff *skb, u32 blkoff) +{ + struct ip_iptfs_hdr *h; + size_t hsz = sizeof(*h); + + /* now reset values to be pointing at the rest of the packets */ + h = skb_push(skb, hsz); + memset(h, 0, hsz); + if (blkoff) + h->block_offset = 
htons(blkoff); + + /* network_header current points at the inner IP packet + * move it to the iptfs header + */ + skb->transport_header = skb->network_header; + skb->network_header -= hsz; + + IPCB(skb)->flags |= IPSKB_XFRM_TUNNEL_SIZE; +} + +/** + * iptfs_copy_create_frag() - create an inner fragment skb. + * @st: The source packet data. + * @offset: offset in @st of the new fragment data. + * @copy_len: the amount of data to copy from @st. + * + * Create a new skb holding a single IPTFS inner packet fragment. @copy_len must + * not be greater than the max fragment size. + * + * Return: the new fragment skb or an ERR_PTR(). + */ +static struct sk_buff *iptfs_copy_create_frag(struct skb_seq_state *st, u32 offset, u32 copy_len) +{ + struct sk_buff *src = st->root_skb; + struct sk_buff *skb; + int err; + + skb = iptfs_alloc_skb(src, copy_len, true); + if (!skb) + return ERR_PTR(-ENOMEM); + + /* Now copy `copy_len` data from src */ + err = skb_copy_seq_read(st, offset, skb_put(skb, copy_len), copy_len); + if (err) { + kfree_skb(skb); + return ERR_PTR(err); + } + + return skb; +} + +/** + * iptfs_copy_create_frags() - create and send N-1 fragments of a larger skb. + * @skbp: the source packet skb (IN), skb holding the last fragment in + * the fragment stream (OUT). + * @xtfs: IPTFS SA state. + * @mtu: the max IPTFS fragment size. + * + * This function is responsible for fragmenting a larger inner packet into a + * sequence of IPTFS payload packets. The last fragment is returned rather than + * being sent so that the caller can append more inner packets (aggregation) if + * there is room. 
+ * + * Return: 0 on success or a negative error code on failure + */ +static int iptfs_copy_create_frags(struct sk_buff **skbp, struct xfrm_iptfs_data *xtfs, u32 mtu) +{ + struct skb_seq_state skbseq; + struct list_head sublist; + struct sk_buff *skb = *skbp; + struct sk_buff *nskb = *skbp; + u32 copy_len, offset; + u32 to_copy = skb->len - mtu; + u32 blkoff = 0; + int err = 0; + + INIT_LIST_HEAD(&sublist); + + skb_prepare_seq_read(skb, 0, skb->len, &skbseq); + + /* A trimmed `skb` will be sent as the first fragment, later. */ + offset = mtu; + to_copy = skb->len - offset; + while (to_copy) { + /* Send all but last fragment to allow agg. append */ + trace_iptfs_first_fragmenting(nskb, mtu, to_copy, NULL); + list_add_tail(&nskb->list, &sublist); + + /* FUTURE: if the packet has an odd/non-aligning length we could + * send less data in the penultimate fragment so that the last + * fragment then ends on an aligned boundary. + */ + copy_len = min(to_copy, mtu); + nskb = iptfs_copy_create_frag(&skbseq, offset, copy_len); + if (IS_ERR(nskb)) { + XFRM_INC_STATS(xs_net(xtfs->x), LINUX_MIB_XFRMOUTERROR); + skb_abort_seq_read(&skbseq); + err = PTR_ERR(nskb); + nskb = NULL; + break; + } + iptfs_output_prepare_skb(nskb, to_copy); + offset += copy_len; + to_copy -= copy_len; + blkoff = to_copy; + } + skb_abort_seq_read(&skbseq); + + /* return last fragment that will be unsent (or NULL) */ + *skbp = nskb; + if (nskb) + trace_iptfs_first_final_fragment(nskb, mtu, blkoff, NULL); + + /* trim the original skb to MTU */ + if (!err) + err = pskb_trim(skb, mtu); + + if (err) { + /* Free all frags. Don't bother sending a partial packet we will + * never complete. 
+ */ + kfree_skb(nskb); + list_for_each_entry_safe(skb, nskb, &sublist, list) { + skb_list_del_init(skb); + kfree_skb(skb); + } + return err; + } + + /* prepare the initial fragment with an iptfs header */ + iptfs_output_prepare_skb(skb, 0); + + /* Send all but last fragment, if we fail to send a fragment then free + * the rest -- no point in sending a packet that can't be reassembled. + */ + list_for_each_entry_safe(skb, nskb, &sublist, list) { + skb_list_del_init(skb); + if (!err) + err = xfrm_output(NULL, skb); + else + kfree_skb(skb); + } + if (err) + kfree_skb(*skbp); + return err; +} + +/** + * iptfs_first_skb() - handle the first dequeued inner packet for output + * @skbp: the source packet skb (IN), skb holding the last fragment in + * the fragment stream (OUT). + * @xtfs: IPTFS SA state. + * @mtu: the max IPTFS fragment size. + * + * This function is responsible for fragmenting a larger inner packet into a + * sequence of IPTFS payload packets. + * + * The last fragment is returned rather than being sent so that the caller can + * append more inner packets (aggregation) if there is room. + * + * Return: 0 on success or a negative error code on failure + */ +static int iptfs_first_skb(struct sk_buff **skbp, struct xfrm_iptfs_data *xtfs, u32 mtu) +{ + struct sk_buff *skb = *skbp; + int err; + + /* Classic ESP skips the don't fragment ICMP error if DF is clear on + * the inner packet or ignore_df is set. Otherwise it will send an ICMP + * or local error if the inner packet won't fit it's MTU. + * + * With IPTFS we do not care about the inner packet DF bit. If the + * tunnel is configured to "don't fragment" we error back if things + * don't fit in our max packet size. Otherwise we iptfs-fragment as + * normal. 
+ */ + + /* The opportunity for HW offload has ended */ + if (skb->ip_summed == CHECKSUM_PARTIAL) { + err = skb_checksum_help(skb); + if (err) + return err; + } + + /* We've split gso up before queuing */ + + trace_iptfs_first_dequeue(skb, mtu, 0, ip_hdr(skb)); + + /* Consider the buffer Tx'd and no longer owned */ + skb_orphan(skb); + + /* Simple case -- it fits. `mtu` accounted for all the overhead + * including the basic IPTFS header. + */ + if (skb->len <= mtu) { + iptfs_output_prepare_skb(skb, 0); + return 0; + } + + return iptfs_copy_create_frags(skbp, xtfs, mtu); +} + +static struct sk_buff **iptfs_rehome_fraglist(struct sk_buff **nextp, struct sk_buff *child) +{ + u32 fllen = 0; + + /* It might be possible to account for a frag list in addition to page + * fragment if it's a valid state to be in. The page fragments size + * should be kept as data_len so only the frag_list size is removed, + * this must be done above as well. + */ + *nextp = skb_shinfo(child)->frag_list; + while (*nextp) { + fllen += (*nextp)->len; + nextp = &(*nextp)->next; + } + skb_frag_list_init(child); + child->len -= fllen; + child->data_len -= fllen; + + return nextp; +} + +static void iptfs_consume_frags(struct sk_buff *to, struct sk_buff *from) +{ + struct skb_shared_info *fromi = skb_shinfo(from); + struct skb_shared_info *toi = skb_shinfo(to); + unsigned int new_truesize; + + /* If we have data in a head page, grab it */ + if (!skb_headlen(from)) { + new_truesize = SKB_TRUESIZE(skb_end_offset(from)); + } else { + iptfs_skb_head_to_frag(from, &toi->frags[toi->nr_frags]); + skb_frag_ref(to, toi->nr_frags++); + new_truesize = SKB_DATA_ALIGN(sizeof(struct sk_buff)); + } + + /* Move any other page fragments rather than copy */ + memcpy(&toi->frags[toi->nr_frags], fromi->frags, + sizeof(fromi->frags[0]) * fromi->nr_frags); + toi->nr_frags += fromi->nr_frags; + fromi->nr_frags = 0; + from->data_len = 0; + from->len = 0; + to->truesize += from->truesize - new_truesize; + from->truesize = 
new_truesize; + + /* We are done with this SKB */ + consume_skb(from); +} + +static void iptfs_output_queued(struct xfrm_state *x, struct sk_buff_head *list) +{ + struct xfrm_iptfs_data *xtfs = x->mode_data; + struct sk_buff *skb, *skb2, **nextp; + struct skb_shared_info *shi, *shi2; + + /* If we are fragmenting due to a large inner packet we will output all + * the outer IPTFS packets required to contain the fragments of the + * single large inner packet. These outer packets need to be sent + * consecutively (ESP seq-wise). Since this output function is always + * running from a timer we do not need a lock to provide this guarantee. + * We will output our packets consecutively before the timer is allowed + * to run again on some other CPU. + */ + + while ((skb = __skb_dequeue(list))) { + u32 mtu = iptfs_get_cur_pmtu(x, xtfs, skb); + bool share_ok = true; + int remaining; + + /* protocol comes to us cleared sometimes */ + skb->protocol = x->outer_mode.family == AF_INET ? htons(ETH_P_IP) : + htons(ETH_P_IPV6); + + if (skb->len > mtu && xtfs->cfg.dont_frag) { + /* We handle this case before enqueueing so we are only + * here b/c MTU changed after we enqueued before we + * dequeued, just drop these. + */ + XFRM_INC_STATS(xs_net(x), LINUX_MIB_XFRMOUTERROR); + + trace_iptfs_first_toobig(skb, mtu, 0, ip_hdr(skb)); + kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG); + continue; + } + + /* Convert first inner packet into an outer IPTFS packet, + * dealing with any fragmentation into multiple outer packets + * if necessary. + */ + if (iptfs_first_skb(&skb, xtfs, mtu)) + continue; + + /* If fragmentation was required the returned skb is the last + * IPTFS fragment in the chain, and it's IPTFS header blkoff has + * been set just past the end of the fragment data. + * + * In either case the space remaining to send more inner packet + * data is `mtu` - (skb->len - sizeof iptfs header). 
This is b/c + * the `mtu` value has the basic IPTFS header len accounted for, + * and we added that header to the skb so it is a part of + * skb->len, thus we subtract it from the skb length. + */ + remaining = mtu - (skb->len - sizeof(struct ip_iptfs_hdr)); + + /* Re-home (un-nest) nested fragment lists. We need to do this + * b/c we will simply be appending any following aggregated + * inner packets using the frag list. + */ + shi = skb_shinfo(skb); + nextp = &shi->frag_list; + while (*nextp) { + if (skb_has_frag_list(*nextp)) + nextp = iptfs_rehome_fraglist(&(*nextp)->next, *nextp); + else + nextp = &(*nextp)->next; + } + + if (shi->frag_list || skb_cloned(skb) || skb_shared(skb)) + share_ok = false; + + /* See if we have enough space to simply append. + * + * NOTE: Maybe do not append if we will be mis-aligned, + * SW-based endpoints will probably have to copy in this + * case. + */ + while ((skb2 = skb_peek(list))) { + trace_iptfs_ingress_nth_peek(skb2, remaining); + if (skb2->len > remaining) + break; + + __skb_unlink(skb2, list); + + /* Consider the buffer Tx'd and no longer owned */ + skb_orphan(skb); + + /* If we don't have a cksum in the packet we need to add + * one before encapsulation. + */ + if (skb2->ip_summed == CHECKSUM_PARTIAL) { + if (skb_checksum_help(skb2)) { + XFRM_INC_STATS(xs_net(x), LINUX_MIB_XFRMOUTERROR); + kfree_skb(skb2); + continue; + } + } + + /* skb->pp_recycle is passed to __skb_flag_unref for all + * frag pages so we can only share pages with skb's who + * match ourselves. 
+ */ + shi2 = skb_shinfo(skb2); + if (share_ok && + (shi2->frag_list || + (!skb2->head_frag && skb_headlen(skb)) || + skb->pp_recycle != skb2->pp_recycle || + skb_zcopy(skb2) || + (shi->nr_frags + shi2->nr_frags + 1 > MAX_SKB_FRAGS))) + share_ok = false; + + /* Do accounting */ + skb->data_len += skb2->len; + skb->len += skb2->len; + remaining -= skb2->len; + + trace_iptfs_ingress_nth_add(skb2, share_ok); + + if (share_ok) { + iptfs_consume_frags(skb, skb2); + } else { + /* Append to the frag_list */ + *nextp = skb2; + nextp = &skb2->next; + if (skb_has_frag_list(skb2)) + nextp = iptfs_rehome_fraglist(nextp, + skb2); + skb->truesize += skb2->truesize; + } + } + + xfrm_output(NULL, skb); + } +} + +static enum hrtimer_restart iptfs_delay_timer(struct hrtimer *me) +{ + struct sk_buff_head list; + struct xfrm_iptfs_data *xtfs; + struct xfrm_state *x; + time64_t settime; + + xtfs = container_of(me, typeof(*xtfs), iptfs_timer); + x = xtfs->x; + + /* Process all the queued packets + * + * softirq execution order: timer > tasklet > hrtimer + * + * Network rx will have run before us giving one last chance to queue + * ingress packets for us to process and transmit. + */ + + spin_lock(&x->lock); + __skb_queue_head_init(&list); + skb_queue_splice_init(&xtfs->queue, &list); + xtfs->queue_size = 0; + settime = xtfs->iptfs_settime; + spin_unlock(&x->lock); + + /* After the above unlock, packets can begin queuing again, and the + * timer can be set again, from another CPU either in softirq or user + * context (not from this one since we are running at softirq level + * already). + */ + + trace_iptfs_timer_expire(xtfs, (unsigned long long)(ktime_get_raw_fast_ns() - settime)); + + iptfs_output_queued(x, &list); + + return HRTIMER_NORESTART; +} + +/** + * iptfs_encap_add_ipv4() - add outer encaps + * @x: xfrm state + * @skb: the packet + * + * This was originally taken from xfrm4_tunnel_encap_add. 
The reason for the + * copy is that IP-TFS/AGGFRAG can have different functionality for how to set + * the TOS/DSCP bits. Sets the protocol to a different value and doesn't do + * anything with inner headers as they aren't pointing into a normal IP + * singleton inner packet. + * + * Return: 0 on success or a negative error code on failure + */ +static int iptfs_encap_add_ipv4(struct xfrm_state *x, struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct iphdr *top_iph; + + skb_reset_inner_network_header(skb); + skb_reset_inner_transport_header(skb); + + skb_set_network_header(skb, -(x->props.header_len - x->props.enc_hdr_len)); + skb->mac_header = skb->network_header + offsetof(struct iphdr, protocol); + skb->transport_header = skb->network_header + sizeof(*top_iph); + + top_iph = ip_hdr(skb); + top_iph->ihl = 5; + top_iph->version = 4; + top_iph->protocol = IPPROTO_AGGFRAG; + + /* As we have 0, fractional, 1 or N inner packets there's no obviously + * correct DSCP mapping to inherit. ECN should be cleared per RFC9347 + * 3.1. + */ + top_iph->tos = 0; + + top_iph->frag_off = htons(IP_DF); + top_iph->ttl = ip4_dst_hoplimit(xfrm_dst_child(dst)); + top_iph->saddr = x->props.saddr.a4; + top_iph->daddr = x->id.daddr.a4; + ip_select_ident(dev_net(dst->dev), skb, NULL); + + return 0; +} + +#if IS_ENABLED(CONFIG_IPV6) +/** + * iptfs_encap_add_ipv6() - add outer encaps + * @x: xfrm state + * @skb: the packet + * + * This was originally taken from xfrm6_tunnel_encap_add. The reason for the + * copy is that IP-TFS/AGGFRAG can have different functionality for how to set + * the flow label and TOS/DSCP bits. It also sets the protocol to a different + * value and doesn't do anything with inner headers as they aren't pointing into + * a normal IP singleton inner packet. 
+ * + * Return: 0 on success or a negative error code on failure + */ +static int iptfs_encap_add_ipv6(struct xfrm_state *x, struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct ipv6hdr *top_iph; + int dsfield; + + skb_reset_inner_network_header(skb); + skb_reset_inner_transport_header(skb); + + skb_set_network_header(skb, -x->props.header_len + x->props.enc_hdr_len); + skb->mac_header = skb->network_header + offsetof(struct ipv6hdr, nexthdr); + skb->transport_header = skb->network_header + sizeof(*top_iph); + + top_iph = ipv6_hdr(skb); + top_iph->version = 6; + top_iph->priority = 0; + memset(top_iph->flow_lbl, 0, sizeof(top_iph->flow_lbl)); + top_iph->nexthdr = IPPROTO_AGGFRAG; + + /* As we have 0, fractional, 1 or N inner packets there's no obviously + * correct DSCP mapping to inherit. ECN should be cleared per RFC9347 + * 3.1. + */ + dsfield = 0; + ipv6_change_dsfield(top_iph, 0, dsfield); + + top_iph->hop_limit = ip6_dst_hoplimit(xfrm_dst_child(dst)); + top_iph->saddr = *(struct in6_addr *)&x->props.saddr; + top_iph->daddr = *(struct in6_addr *)&x->id.daddr; + + return 0; +} +#endif + +/** + * iptfs_prepare_output() - prepare the skb for output + * @x: xfrm state + * @skb: the packet + * + * Return: Error value, if 0 then skb values should be as follows: + * - transport_header should point at ESP header + * - network_header should point at Outer IP header + * - mac_header should point at protocol/nexthdr of the outer IP + */ +static int iptfs_prepare_output(struct xfrm_state *x, struct sk_buff *skb) +{ + if (x->outer_mode.family == AF_INET) + return iptfs_encap_add_ipv4(x, skb); + if (x->outer_mode.family == AF_INET6) { +#if IS_ENABLED(CONFIG_IPV6) + return iptfs_encap_add_ipv6(x, skb); +#else + return -EAFNOSUPPORT; +#endif + } + return -EOPNOTSUPP; +} + +/* ========================== */ +/* State Management Functions */ +/* ========================== */ + +/** + * __iptfs_get_inner_mtu() - return inner MTU with no fragmentation. 
+ * @x: xfrm state. + * @outer_mtu: the outer mtu + * + * Return: Correct MTU taking in to account the encap overhead. + */ +static u32 __iptfs_get_inner_mtu(struct xfrm_state *x, int outer_mtu) +{ + struct crypto_aead *aead; + u32 blksize; + + aead = x->data; + blksize = ALIGN(crypto_aead_blocksize(aead), 4); + return ((outer_mtu - x->props.header_len - crypto_aead_authsize(aead)) & + ~(blksize - 1)) - 2; +} + +/** + * iptfs_get_inner_mtu() - return the inner MTU for an IPTFS xfrm. + * @x: xfrm state. + * @outer_mtu: Outer MTU for the encapsulated packet. + * + * Return: Correct MTU taking in to account the encap overhead. + */ +static u32 iptfs_get_inner_mtu(struct xfrm_state *x, int outer_mtu) +{ + struct xfrm_iptfs_data *xtfs = x->mode_data; + + /* If not dont-frag we have no MTU */ + if (!xtfs->cfg.dont_frag) + return x->outer_mode.family == AF_INET ? IP_MAX_MTU : IP6_MAX_MTU; + return __iptfs_get_inner_mtu(x, outer_mtu); +} + +/** + * iptfs_user_init() - initialize the SA with IPTFS options from netlink. 
+ * @net: the net data + * @x: xfrm state + * @attrs: netlink attributes + * @extack: extack return data + * + * Return: 0 on success or a negative error code on failure + */ +static int iptfs_user_init(struct net *net, struct xfrm_state *x, + struct nlattr **attrs, + struct netlink_ext_ack *extack) +{ + struct xfrm_iptfs_data *xtfs = x->mode_data; + struct xfrm_iptfs_config *xc; + u64 q; + + xc = &xtfs->cfg; + xc->max_queue_size = IPTFS_DEFAULT_MAX_QUEUE_SIZE; + xc->reorder_win_size = IPTFS_DEFAULT_REORDER_WINDOW; + xtfs->drop_time_ns = IPTFS_DEFAULT_DROP_TIME_USECS * NSECS_IN_USEC; + xtfs->init_delay_ns = IPTFS_DEFAULT_INIT_DELAY_USECS * NSECS_IN_USEC; + + if (attrs[XFRMA_IPTFS_DONT_FRAG]) + xc->dont_frag = true; + if (attrs[XFRMA_IPTFS_REORDER_WINDOW]) + xc->reorder_win_size = + nla_get_u16(attrs[XFRMA_IPTFS_REORDER_WINDOW]); + /* saved array is for saving 1..N seq nums from wantseq */ + if (xc->reorder_win_size) { + xtfs->w_saved = kcalloc(xc->reorder_win_size, + sizeof(*xtfs->w_saved), GFP_KERNEL); + if (!xtfs->w_saved) { + NL_SET_ERR_MSG(extack, "Cannot alloc reorder window"); + return -ENOMEM; + } + } + if (attrs[XFRMA_IPTFS_PKT_SIZE]) { + xc->pkt_size = nla_get_u32(attrs[XFRMA_IPTFS_PKT_SIZE]); + if (!xc->pkt_size) { + xtfs->payload_mtu = 0; + } else if (xc->pkt_size > x->props.header_len) { + xtfs->payload_mtu = xc->pkt_size - x->props.header_len; + } else { + NL_SET_ERR_MSG(extack, + "Packet size must be 0 or greater than IPTFS/ESP header length"); + return -EINVAL; + } + } + if (attrs[XFRMA_IPTFS_MAX_QSIZE]) + xc->max_queue_size = nla_get_u32(attrs[XFRMA_IPTFS_MAX_QSIZE]); + if (attrs[XFRMA_IPTFS_DROP_TIME]) + xtfs->drop_time_ns = + (u64)nla_get_u32(attrs[XFRMA_IPTFS_DROP_TIME]) * + NSECS_IN_USEC; + if (attrs[XFRMA_IPTFS_INIT_DELAY]) + xtfs->init_delay_ns = + (u64)nla_get_u32(attrs[XFRMA_IPTFS_INIT_DELAY]) * NSECS_IN_USEC; + + q = (u64)xc->max_queue_size * 95; + do_div(q, 100); + xtfs->ecn_queue_size = (u32)q; + + return 0; +} + +static unsigned int 
iptfs_sa_len(const struct xfrm_state *x) +{ + struct xfrm_iptfs_data *xtfs = x->mode_data; + struct xfrm_iptfs_config *xc = &xtfs->cfg; + unsigned int l = 0; + + if (x->dir == XFRM_SA_DIR_IN) { + l += nla_total_size(sizeof(u32)); /* drop time usec */ + l += nla_total_size(sizeof(xc->reorder_win_size)); + } else { + if (xc->dont_frag) + l += nla_total_size(0); /* dont-frag flag */ + l += nla_total_size(sizeof(u32)); /* init delay usec */ + l += nla_total_size(sizeof(xc->max_queue_size)); + l += nla_total_size(sizeof(xc->pkt_size)); + } + + return l; +} + +static int iptfs_copy_to_user(struct xfrm_state *x, struct sk_buff *skb) +{ + struct xfrm_iptfs_data *xtfs = x->mode_data; + struct xfrm_iptfs_config *xc = &xtfs->cfg; + int ret = 0; + u64 q; + + if (x->dir == XFRM_SA_DIR_IN) { + q = xtfs->drop_time_ns; + do_div(q, NSECS_IN_USEC); + ret = nla_put_u32(skb, XFRMA_IPTFS_DROP_TIME, q); + if (ret) + return ret; + + ret = nla_put_u16(skb, XFRMA_IPTFS_REORDER_WINDOW, + xc->reorder_win_size); + } else { + if (xc->dont_frag) { + ret = nla_put_flag(skb, XFRMA_IPTFS_DONT_FRAG); + if (ret) + return ret; + } + + q = xtfs->init_delay_ns; + do_div(q, NSECS_IN_USEC); + ret = nla_put_u32(skb, XFRMA_IPTFS_INIT_DELAY, q); + if (ret) + return ret; + + ret = nla_put_u32(skb, XFRMA_IPTFS_MAX_QSIZE, xc->max_queue_size); + if (ret) + return ret; + + ret = nla_put_u32(skb, XFRMA_IPTFS_PKT_SIZE, xc->pkt_size); + } + + return ret; +} + +static void __iptfs_init_state(struct xfrm_state *x, + struct xfrm_iptfs_data *xtfs) +{ + __skb_queue_head_init(&xtfs->queue); + hrtimer_init(&xtfs->iptfs_timer, CLOCK_MONOTONIC, IPTFS_HRTIMER_MODE); + xtfs->iptfs_timer.function = iptfs_delay_timer; + + spin_lock_init(&xtfs->drop_lock); + hrtimer_init(&xtfs->drop_timer, CLOCK_MONOTONIC, IPTFS_HRTIMER_MODE); + xtfs->drop_timer.function = iptfs_drop_timer; + + /* Modify type (esp) adjustment values */ + + if (x->props.family == AF_INET) + x->props.header_len += sizeof(struct iphdr) + sizeof(struct 
ip_iptfs_hdr); + else if (x->props.family == AF_INET6) + x->props.header_len += sizeof(struct ipv6hdr) + sizeof(struct ip_iptfs_hdr); + x->props.enc_hdr_len = sizeof(struct ip_iptfs_hdr); + + /* Always keep a module reference when x->mode_data is set */ + __module_get(x->mode_cbs->owner); + + x->mode_data = xtfs; + xtfs->x = x; +} + +static int iptfs_clone_state(struct xfrm_state *x, struct xfrm_state *orig) +{ + struct xfrm_iptfs_data *xtfs; + + xtfs = kmemdup(orig->mode_data, sizeof(*xtfs), GFP_KERNEL); + if (!xtfs) + return -ENOMEM; + + x->mode_data = xtfs; + xtfs->x = x; + + xtfs->ra_newskb = NULL; + if (xtfs->cfg.reorder_win_size) { + xtfs->w_saved = kcalloc(xtfs->cfg.reorder_win_size, + sizeof(*xtfs->w_saved), GFP_KERNEL); + if (!xtfs->w_saved) { + kfree_sensitive(xtfs); + return -ENOMEM; + } + } + + return 0; +} + +static int iptfs_init_state(struct xfrm_state *x) +{ + struct xfrm_iptfs_data *xtfs; + + if (x->mode_data) { + /* We have arrived here from xfrm_state_clone() */ + xtfs = x->mode_data; + } else { + xtfs = kzalloc(sizeof(*xtfs), GFP_KERNEL); + if (!xtfs) + return -ENOMEM; + } + + __iptfs_init_state(x, xtfs); + + return 0; +} + +static void iptfs_destroy_state(struct xfrm_state *x) +{ + struct xfrm_iptfs_data *xtfs = x->mode_data; + struct sk_buff_head list; + struct skb_wseq *s, *se; + struct sk_buff *skb; + + if (!xtfs) + return; + + spin_lock_bh(&xtfs->x->lock); + hrtimer_cancel(&xtfs->iptfs_timer); + __skb_queue_head_init(&list); + skb_queue_splice_init(&xtfs->queue, &list); + spin_unlock_bh(&xtfs->x->lock); + + while ((skb = __skb_dequeue(&list))) + kfree_skb(skb); + + spin_lock_bh(&xtfs->drop_lock); + hrtimer_cancel(&xtfs->drop_timer); + spin_unlock_bh(&xtfs->drop_lock); + + if (xtfs->ra_newskb) + kfree_skb(xtfs->ra_newskb); + + for (s = xtfs->w_saved, se = s + xtfs->w_savedlen; s < se; s++) { + if (s->skb) + kfree_skb(s->skb); + } + + kfree_sensitive(xtfs->w_saved); + kfree_sensitive(xtfs); + + module_put(x->mode_cbs->owner); +} + +static 
const struct xfrm_mode_cbs iptfs_mode_cbs = { + .owner = THIS_MODULE, + .init_state = iptfs_init_state, + .clone_state = iptfs_clone_state, + .destroy_state = iptfs_destroy_state, + .user_init = iptfs_user_init, + .copy_to_user = iptfs_copy_to_user, + .sa_len = iptfs_sa_len, + .get_inner_mtu = iptfs_get_inner_mtu, + .input = iptfs_input, + .output = iptfs_output_collect, + .prepare_output = iptfs_prepare_output, +}; + +static int __init xfrm_iptfs_init(void) +{ + int err; + + pr_info("xfrm_iptfs: IPsec IP-TFS tunnel mode module\n"); + + err = xfrm_register_mode_cbs(XFRM_MODE_IPTFS, &iptfs_mode_cbs); + if (err < 0) + pr_info("%s: can't register IP-TFS\n", __func__); + + return err; +} + +static void __exit xfrm_iptfs_fini(void) +{ + xfrm_unregister_mode_cbs(XFRM_MODE_IPTFS); +} + +module_init(xfrm_iptfs_init); +module_exit(xfrm_iptfs_fini); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("IP-TFS support for xfrm ipsec tunnels"); diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index e5722c95b8bb..b5025cf6136e 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -472,6 +472,8 @@ static int xfrm_outer_mode_output(struct xfrm_state *x, struct sk_buff *skb) WARN_ON_ONCE(1); break; default: + if (x->mode_cbs && x->mode_cbs->prepare_output) + return x->mode_cbs->prepare_output(x, skb); WARN_ON_ONCE(1); break; } @@ -675,6 +677,10 @@ static void xfrm_get_inner_ipproto(struct sk_buff *skb, struct xfrm_state *x) return; } + if (x->outer_mode.encap == XFRM_MODE_IPTFS) { + xo->inner_ipproto = IPPROTO_AGGFRAG; + return; + } /* non-Tunnel Mode */ if (!skb->encapsulation) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 4408c11c0835..9e510021ee91 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -2497,6 +2497,7 @@ xfrm_tmpl_resolve_one(struct xfrm_policy *policy, const struct flowi *fl, struct xfrm_tmpl *tmpl = &policy->xfrm_vec[i]; if (tmpl->mode == XFRM_MODE_TUNNEL || + tmpl->mode == XFRM_MODE_IPTFS || tmpl->mode == 
XFRM_MODE_BEET) { remote = &tmpl->id.daddr; local = &tmpl->saddr; @@ -2748,13 +2749,17 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, dst1->input = dst_discard; - rcu_read_lock(); - afinfo = xfrm_state_afinfo_get_rcu(inner_mode->family); - if (likely(afinfo)) - dst1->output = afinfo->output; - else - dst1->output = dst_discard_out; - rcu_read_unlock(); + if (xfrm[i]->mode_cbs && xfrm[i]->mode_cbs->output) { + dst1->output = xfrm[i]->mode_cbs->output; + } else { + rcu_read_lock(); + afinfo = xfrm_state_afinfo_get_rcu(inner_mode->family); + if (likely(afinfo)) + dst1->output = afinfo->output; + else + dst1->output = dst_discard_out; + rcu_read_unlock(); + } xdst_prev = xdst; @@ -3290,7 +3295,8 @@ no_transform: ok: xfrm_pols_put(pols, drop_pols); if (dst && dst->xfrm && - dst->xfrm->props.mode == XFRM_MODE_TUNNEL) + (dst->xfrm->props.mode == XFRM_MODE_TUNNEL || + dst->xfrm->props.mode == XFRM_MODE_IPTFS)) dst->flags |= DST_XFRM_TUNNEL; return dst; @@ -4519,6 +4525,7 @@ static int migrate_tmpl_match(const struct xfrm_migrate *m, const struct xfrm_tm switch (t->mode) { case XFRM_MODE_TUNNEL: case XFRM_MODE_BEET: + case XFRM_MODE_IPTFS: if (xfrm_addr_equal(&t->id.daddr, &m->old_daddr, m->old_family) && xfrm_addr_equal(&t->saddr, &m->old_saddr, @@ -4561,7 +4568,8 @@ static int xfrm_policy_migrate(struct xfrm_policy *pol, continue; n++; if (pol->xfrm_vec[i].mode != XFRM_MODE_TUNNEL && - pol->xfrm_vec[i].mode != XFRM_MODE_BEET) + pol->xfrm_vec[i].mode != XFRM_MODE_BEET && + pol->xfrm_vec[i].mode != XFRM_MODE_IPTFS) continue; /* update endpoints */ memcpy(&pol->xfrm_vec[i].id.daddr, &mp->new_daddr, diff --git a/net/xfrm/xfrm_proc.c b/net/xfrm/xfrm_proc.c index eeb984be03a7..8e07dd614b0b 100644 --- a/net/xfrm/xfrm_proc.c +++ b/net/xfrm/xfrm_proc.c @@ -43,6 +43,8 @@ static const struct snmp_mib xfrm_mib_list[] = { SNMP_MIB_ITEM("XfrmAcquireError", LINUX_MIB_XFRMACQUIREERROR), SNMP_MIB_ITEM("XfrmOutStateDirError", LINUX_MIB_XFRMOUTSTATEDIRERROR), 
SNMP_MIB_ITEM("XfrmInStateDirError", LINUX_MIB_XFRMINSTATEDIRERROR), + SNMP_MIB_ITEM("XfrmInIptfsError", LINUX_MIB_XFRMINIPTFSERROR), + SNMP_MIB_ITEM("XfrmOutNoQueueSpace", LINUX_MIB_XFRMOUTNOQSPACE), SNMP_MIB_SENTINEL }; diff --git a/net/xfrm/xfrm_replay.c b/net/xfrm/xfrm_replay.c index bc56c6305725..e500aebbad22 100644 --- a/net/xfrm/xfrm_replay.c +++ b/net/xfrm/xfrm_replay.c @@ -729,6 +729,7 @@ static int xfrm_replay_overflow_offload_esn(struct xfrm_state *x, struct sk_buff } replay_esn->oseq = oseq; + xfrm_dev_state_advance_esn(x); if (xfrm_aevent_is_on(net)) xfrm_replay_notify(x, XFRM_REPLAY_UPDATE); diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 67ca7ac955a3..34067cb8a479 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -467,6 +467,11 @@ static const struct xfrm_mode xfrm4_mode_map[XFRM_MODE_MAX] = { .flags = XFRM_MODE_FLAG_TUNNEL, .family = AF_INET, }, + [XFRM_MODE_IPTFS] = { + .encap = XFRM_MODE_IPTFS, + .flags = XFRM_MODE_FLAG_TUNNEL, + .family = AF_INET, + }, }; static const struct xfrm_mode xfrm6_mode_map[XFRM_MODE_MAX] = { @@ -488,6 +493,11 @@ static const struct xfrm_mode xfrm6_mode_map[XFRM_MODE_MAX] = { .flags = XFRM_MODE_FLAG_TUNNEL, .family = AF_INET6, }, + [XFRM_MODE_IPTFS] = { + .encap = XFRM_MODE_IPTFS, + .flags = XFRM_MODE_FLAG_TUNNEL, + .family = AF_INET6, + }, }; static const struct xfrm_mode *xfrm_get_mode(unsigned int encap, int family) @@ -515,6 +525,60 @@ static const struct xfrm_mode *xfrm_get_mode(unsigned int encap, int family) return NULL; } +static const struct xfrm_mode_cbs __rcu *xfrm_mode_cbs_map[XFRM_MODE_MAX]; +static DEFINE_SPINLOCK(xfrm_mode_cbs_map_lock); + +int xfrm_register_mode_cbs(u8 mode, const struct xfrm_mode_cbs *mode_cbs) +{ + if (mode >= XFRM_MODE_MAX) + return -EINVAL; + + spin_lock_bh(&xfrm_mode_cbs_map_lock); + rcu_assign_pointer(xfrm_mode_cbs_map[mode], mode_cbs); + spin_unlock_bh(&xfrm_mode_cbs_map_lock); + + return 0; +} +EXPORT_SYMBOL(xfrm_register_mode_cbs); + +void 
xfrm_unregister_mode_cbs(u8 mode) +{ + if (mode >= XFRM_MODE_MAX) + return; + + spin_lock_bh(&xfrm_mode_cbs_map_lock); + RCU_INIT_POINTER(xfrm_mode_cbs_map[mode], NULL); + spin_unlock_bh(&xfrm_mode_cbs_map_lock); + synchronize_rcu(); +} +EXPORT_SYMBOL(xfrm_unregister_mode_cbs); + +static const struct xfrm_mode_cbs *xfrm_get_mode_cbs(u8 mode) +{ + const struct xfrm_mode_cbs *cbs; + bool try_load = true; + + if (mode >= XFRM_MODE_MAX) + return NULL; + +retry: + rcu_read_lock(); + + cbs = rcu_dereference(xfrm_mode_cbs_map[mode]); + if (cbs && !try_module_get(cbs->owner)) + cbs = NULL; + + rcu_read_unlock(); + + if (mode == XFRM_MODE_IPTFS && !cbs && try_load) { + request_module("xfrm-iptfs"); + try_load = false; + goto retry; + } + + return cbs; +} + void xfrm_state_free(struct xfrm_state *x) { kmem_cache_free(xfrm_state_cache, x); @@ -523,6 +587,8 @@ EXPORT_SYMBOL(xfrm_state_free); static void ___xfrm_state_destroy(struct xfrm_state *x) { + if (x->mode_cbs && x->mode_cbs->destroy_state) + x->mode_cbs->destroy_state(x); hrtimer_cancel(&x->mtimer); del_timer_sync(&x->rtimer); kfree(x->aead); @@ -682,6 +748,7 @@ struct xfrm_state *xfrm_state_alloc(struct net *net) x->replay_maxdiff = 0; x->pcpu_num = UINT_MAX; spin_lock_init(&x->lock); + x->mode_data = NULL; } return x; } @@ -1945,6 +2012,12 @@ static struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig, x->new_mapping_sport = 0; x->dir = orig->dir; + x->mode_cbs = orig->mode_cbs; + if (x->mode_cbs && x->mode_cbs->clone_state) { + if (x->mode_cbs->clone_state(x, orig)) + goto error; + } + return x; error: @@ -2271,6 +2344,7 @@ static int __xfrm6_state_sort_cmp(const void *p) #endif case XFRM_MODE_TUNNEL: case XFRM_MODE_BEET: + case XFRM_MODE_IPTFS: return 4; } return 5; @@ -2297,6 +2371,7 @@ static int __xfrm6_tmpl_sort_cmp(const void *p) #endif case XFRM_MODE_TUNNEL: case XFRM_MODE_BEET: + case XFRM_MODE_IPTFS: return 3; } return 4; @@ -2986,6 +3061,9 @@ u32 xfrm_state_mtu(struct xfrm_state *x, int mtu) case 
XFRM_MODE_TUNNEL: break; default: + if (x->mode_cbs && x->mode_cbs->get_inner_mtu) + return x->mode_cbs->get_inner_mtu(x, mtu); + WARN_ON_ONCE(1); break; } @@ -3086,6 +3164,12 @@ int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload, } } + x->mode_cbs = xfrm_get_mode_cbs(x->props.mode); + if (x->mode_cbs) { + if (x->mode_cbs->init_state) + err = x->mode_cbs->init_state(x); + module_put(x->mode_cbs->owner); + } error: return err; } diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index b2876e09328b..08c6d6f0179f 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -301,6 +301,16 @@ static int verify_newsa_info(struct xfrm_usersa_info *p, NL_SET_ERR_MSG(extack, "TFC padding can only be used in tunnel mode"); goto out; } + if ((attrs[XFRMA_IPTFS_DROP_TIME] || + attrs[XFRMA_IPTFS_REORDER_WINDOW] || + attrs[XFRMA_IPTFS_DONT_FRAG] || + attrs[XFRMA_IPTFS_INIT_DELAY] || + attrs[XFRMA_IPTFS_MAX_QSIZE] || + attrs[XFRMA_IPTFS_PKT_SIZE]) && + p->mode != XFRM_MODE_IPTFS) { + NL_SET_ERR_MSG(extack, "IP-TFS options can only be used in IP-TFS mode"); + goto out; + } break; case IPPROTO_COMP: @@ -373,6 +383,16 @@ static int verify_newsa_info(struct xfrm_usersa_info *p, case XFRM_MODE_ROUTEOPTIMIZATION: case XFRM_MODE_BEET: break; + case XFRM_MODE_IPTFS: + if (p->id.proto != IPPROTO_ESP) { + NL_SET_ERR_MSG(extack, "IP-TFS mode only supported with ESP"); + goto out; + } + if (sa_dir == 0) { + NL_SET_ERR_MSG(extack, "IP-TFS mode requires in or out direction attribute"); + goto out; + } + break; default: NL_SET_ERR_MSG(extack, "Unsupported mode"); @@ -421,6 +441,18 @@ static int verify_newsa_info(struct xfrm_usersa_info *p, goto out; } + if (attrs[XFRMA_IPTFS_DROP_TIME]) { + NL_SET_ERR_MSG(extack, "IP-TFS drop time should not be set for output SA"); + err = -EINVAL; + goto out; + } + + if (attrs[XFRMA_IPTFS_REORDER_WINDOW]) { + NL_SET_ERR_MSG(extack, "IP-TFS reorder window should not be set for output SA"); + err = -EINVAL; + goto out; + } + if 
(attrs[XFRMA_REPLAY_VAL]) { struct xfrm_replay_state *replay; @@ -458,6 +490,30 @@ static int verify_newsa_info(struct xfrm_usersa_info *p, } } + + if (attrs[XFRMA_IPTFS_DONT_FRAG]) { + NL_SET_ERR_MSG(extack, "IP-TFS don't fragment should not be set for input SA"); + err = -EINVAL; + goto out; + } + + if (attrs[XFRMA_IPTFS_INIT_DELAY]) { + NL_SET_ERR_MSG(extack, "IP-TFS initial delay should not be set for input SA"); + err = -EINVAL; + goto out; + } + + if (attrs[XFRMA_IPTFS_MAX_QSIZE]) { + NL_SET_ERR_MSG(extack, "IP-TFS max queue size should not be set for input SA"); + err = -EINVAL; + goto out; + } + + if (attrs[XFRMA_IPTFS_PKT_SIZE]) { + NL_SET_ERR_MSG(extack, "IP-TFS packet size should not be set for input SA"); + err = -EINVAL; + goto out; + } } if (!sa_dir && attrs[XFRMA_SA_PCPU]) { @@ -886,6 +942,12 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, goto error; } + if (x->mode_cbs && x->mode_cbs->user_init) { + err = x->mode_cbs->user_init(net, x, attrs, extack); + if (err) + goto error; + } + return x; error: @@ -1301,6 +1363,10 @@ static int copy_to_user_state_extra(struct xfrm_state *x, if (ret) goto out; } + if (x->mode_cbs && x->mode_cbs->copy_to_user) + ret = x->mode_cbs->copy_to_user(x, skb); + if (ret) + goto out; if (x->mapping_maxage) { ret = nla_put_u32(skb, XFRMA_MTIMER_THRESH, x->mapping_maxage); if (ret) @@ -1958,6 +2024,8 @@ static int validate_tmpl(int nr, struct xfrm_user_tmpl *ut, u16 family, return -EINVAL; } break; + case XFRM_MODE_IPTFS: + break; default: if (ut[i].family != prev_family) { NL_SET_ERR_MSG(extack, "Mode in template doesn't support a family change"); @@ -3220,6 +3288,12 @@ const struct nla_policy xfrma_policy[XFRMA_MAX+1] = { [XFRMA_SA_DIR] = NLA_POLICY_RANGE(NLA_U8, XFRM_SA_DIR_IN, XFRM_SA_DIR_OUT), [XFRMA_NAT_KEEPALIVE_INTERVAL] = { .type = NLA_U32 }, [XFRMA_SA_PCPU] = { .type = NLA_U32 }, + [XFRMA_IPTFS_DROP_TIME] = { .type = NLA_U32 }, + [XFRMA_IPTFS_REORDER_WINDOW] = { .type = NLA_U16 }, + 
[XFRMA_IPTFS_DONT_FRAG] = { .type = NLA_FLAG }, + [XFRMA_IPTFS_INIT_DELAY] = { .type = NLA_U32 }, + [XFRMA_IPTFS_MAX_QSIZE] = { .type = NLA_U32 }, + [XFRMA_IPTFS_PKT_SIZE] = { .type = NLA_U32 }, }; EXPORT_SYMBOL_GPL(xfrma_policy); @@ -3554,6 +3628,9 @@ static inline unsigned int xfrm_sa_len(struct xfrm_state *x) if (x->nat_keepalive_interval) l += nla_total_size(sizeof(x->nat_keepalive_interval)); + if (x->mode_cbs && x->mode_cbs->sa_len) + l += x->mode_cbs->sa_len(x); + return l; } diff --git a/rust/kernel/workqueue.rs b/rust/kernel/workqueue.rs index 4d1d2062f6eb..fd3e97192ed8 100644 --- a/rust/kernel/workqueue.rs +++ b/rust/kernel/workqueue.rs @@ -519,7 +519,15 @@ impl_has_work! { impl{T} HasWork<Self> for ClosureWork<T> { self.work } } -// SAFETY: TODO. +// SAFETY: The `__enqueue` implementation in RawWorkItem uses a `work_struct` initialized with the +// `run` method of this trait as the function pointer because: +// - `__enqueue` gets the `work_struct` from the `Work` field, using `T::raw_get_work`. +// - The only safe way to create a `Work` object is through `Work::new`. +// - `Work::new` makes sure that `T::Pointer::run` is passed to `init_work_with_key`. +// - Finally `Work` and `RawWorkItem` guarantee that the correct `Work` field +// will be used because of the ID const generic bound. This makes sure that `T::raw_get_work` +// uses the correct offset for the `Work` field, and `Work::new` picks the correct +// implementation of `WorkItemPointer` for `Arc<T>`. unsafe impl<T, const ID: u64> WorkItemPointer<ID> for Arc<T> where T: WorkItem<ID, Pointer = Self>, @@ -537,7 +545,13 @@ where } } -// SAFETY: TODO. +// SAFETY: The `work_struct` raw pointer is guaranteed to be valid for the duration of the call to +// the closure because we get it from an `Arc`, which means that the ref count will be at least 1, +// and we don't drop the `Arc` ourselves. 
If `queue_work_on` returns true, it is further guaranteed +// to be valid until a call to the function pointer in `work_struct` because we leak the memory it +// points to, and only reclaim it if the closure returns false, or in `WorkItemPointer::run`, which +// is what the function pointer in the `work_struct` must be pointing to, according to the safety +// requirements of `WorkItemPointer`. unsafe impl<T, const ID: u64> RawWorkItem<ID> for Arc<T> where T: WorkItem<ID, Pointer = Self>, diff --git a/scripts/mksysmap b/scripts/mksysmap index c12723a04655..3accbdb269ac 100755 --- a/scripts/mksysmap +++ b/scripts/mksysmap @@ -26,7 +26,7 @@ # (do not forget a space before each pattern) # local symbols for ARM, MIPS, etc. -/ \\$/d +/ \$/d # local labels, .LBB, .Ltmpxxx, .L__unnamed_xx, .LASANPC, etc. / \.L/d @@ -39,7 +39,7 @@ / __pi_\.L/d # arm64 local symbols in non-VHE KVM namespace -/ __kvm_nvhe_\\$/d +/ __kvm_nvhe_\$/d / __kvm_nvhe_\.L/d # lld arm/aarch64/mips thunks diff --git a/scripts/mod/file2alias.c b/scripts/mod/file2alias.c index 5b5745f00eb3..19ec72a69e90 100644 --- a/scripts/mod/file2alias.c +++ b/scripts/mod/file2alias.c @@ -132,7 +132,8 @@ struct devtable { * based at address m. */ #define DEF_FIELD(m, devid, f) \ - typeof(((struct devid *)0)->f) f = TO_NATIVE(*(typeof(f) *)((m) + OFF_##devid##_##f)) + typeof(((struct devid *)0)->f) f = \ + get_unaligned_native((typeof(f) *)((m) + OFF_##devid##_##f)) /* Define a variable f that holds the address of field f of struct devid * based at address m. 
Due to the way typeof works, for a field of type @@ -600,7 +601,7 @@ static void do_pnp_card_entry(struct module *mod, void *symval) static void do_pcmcia_entry(struct module *mod, void *symval) { char alias[256] = {}; - unsigned int i; + DEF_FIELD(symval, pcmcia_device_id, match_flags); DEF_FIELD(symval, pcmcia_device_id, manf_id); DEF_FIELD(symval, pcmcia_device_id, card_id); @@ -609,10 +610,6 @@ static void do_pcmcia_entry(struct module *mod, void *symval) DEF_FIELD(symval, pcmcia_device_id, device_no); DEF_FIELD_ADDR(symval, pcmcia_device_id, prod_id_hash); - for (i=0; i<4; i++) { - (*prod_id_hash)[i] = TO_NATIVE((*prod_id_hash)[i]); - } - ADD(alias, "m", match_flags & PCMCIA_DEV_ID_MATCH_MANF_ID, manf_id); ADD(alias, "c", match_flags & PCMCIA_DEV_ID_MATCH_CARD_ID, @@ -623,10 +620,14 @@ static void do_pcmcia_entry(struct module *mod, void *symval) function); ADD(alias, "pfn", match_flags & PCMCIA_DEV_ID_MATCH_DEVICE_NO, device_no); - ADD(alias, "pa", match_flags & PCMCIA_DEV_ID_MATCH_PROD_ID1, (*prod_id_hash)[0]); - ADD(alias, "pb", match_flags & PCMCIA_DEV_ID_MATCH_PROD_ID2, (*prod_id_hash)[1]); - ADD(alias, "pc", match_flags & PCMCIA_DEV_ID_MATCH_PROD_ID3, (*prod_id_hash)[2]); - ADD(alias, "pd", match_flags & PCMCIA_DEV_ID_MATCH_PROD_ID4, (*prod_id_hash)[3]); + ADD(alias, "pa", match_flags & PCMCIA_DEV_ID_MATCH_PROD_ID1, + get_unaligned_native(*prod_id_hash + 0)); + ADD(alias, "pb", match_flags & PCMCIA_DEV_ID_MATCH_PROD_ID2, + get_unaligned_native(*prod_id_hash + 1)); + ADD(alias, "pc", match_flags & PCMCIA_DEV_ID_MATCH_PROD_ID3, + get_unaligned_native(*prod_id_hash + 2)); + ADD(alias, "pd", match_flags & PCMCIA_DEV_ID_MATCH_PROD_ID4, + get_unaligned_native(*prod_id_hash + 3)); module_alias_printf(mod, true, "pcmcia:%s", alias); } @@ -654,10 +655,9 @@ static void do_input(char *alias, { unsigned int i; - for (i = min / BITS_PER_LONG; i < max / BITS_PER_LONG + 1; i++) - arr[i] = TO_NATIVE(arr[i]); - for (i = min; i < max; i++) - if (arr[i / BITS_PER_LONG] & 
(1ULL << (i%BITS_PER_LONG))) + for (i = min; i <= max; i++) + if (get_unaligned_native(arr + i / BITS_PER_LONG) & + (1ULL << (i % BITS_PER_LONG))) sprintf(alias + strlen(alias), "%X,*", i); } @@ -812,15 +812,13 @@ static void do_virtio_entry(struct module *mod, void *symval) * Each byte of the guid will be represented by two hex characters * in the name. */ - static void do_vmbus_entry(struct module *mod, void *symval) { - int i; DEF_FIELD_ADDR(symval, hv_vmbus_device_id, guid); - char guid_name[(sizeof(*guid) + 1) * 2]; + char guid_name[sizeof(*guid) * 2 + 1]; - for (i = 0; i < (sizeof(*guid) * 2); i += 2) - sprintf(&guid_name[i], "%02x", TO_NATIVE((guid->b)[i/2])); + for (int i = 0; i < sizeof(*guid); i++) + sprintf(&guid_name[i * 2], "%02x", guid->b[i]); module_alias_printf(mod, false, "vmbus:%s", guid_name); } diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index 94ee49207a45..7ea59dc4926b 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -1138,9 +1138,9 @@ static Elf_Addr addend_386_rel(uint32_t *location, unsigned int r_type) { switch (r_type) { case R_386_32: - return TO_NATIVE(*location); + return get_unaligned_native(location); case R_386_PC32: - return TO_NATIVE(*location) + 4; + return get_unaligned_native(location) + 4; } return (Elf_Addr)(-1); @@ -1161,24 +1161,24 @@ static Elf_Addr addend_arm_rel(void *loc, Elf_Sym *sym, unsigned int r_type) switch (r_type) { case R_ARM_ABS32: case R_ARM_REL32: - inst = TO_NATIVE(*(uint32_t *)loc); + inst = get_unaligned_native((uint32_t *)loc); return inst + sym->st_value; case R_ARM_MOVW_ABS_NC: case R_ARM_MOVT_ABS: - inst = TO_NATIVE(*(uint32_t *)loc); + inst = get_unaligned_native((uint32_t *)loc); offset = sign_extend32(((inst & 0xf0000) >> 4) | (inst & 0xfff), 15); return offset + sym->st_value; case R_ARM_PC24: case R_ARM_CALL: case R_ARM_JUMP24: - inst = TO_NATIVE(*(uint32_t *)loc); + inst = get_unaligned_native((uint32_t *)loc); offset = sign_extend32((inst & 0x00ffffff) << 2, 25); 
return offset + sym->st_value + 8; case R_ARM_THM_MOVW_ABS_NC: case R_ARM_THM_MOVT_ABS: - upper = TO_NATIVE(*(uint16_t *)loc); - lower = TO_NATIVE(*((uint16_t *)loc + 1)); + upper = get_unaligned_native((uint16_t *)loc); + lower = get_unaligned_native((uint16_t *)loc + 1); offset = sign_extend32(((upper & 0x000f) << 12) | ((upper & 0x0400) << 1) | ((lower & 0x7000) >> 4) | @@ -1195,8 +1195,8 @@ static Elf_Addr addend_arm_rel(void *loc, Elf_Sym *sym, unsigned int r_type) * imm11 = lower[10:0] * imm32 = SignExtend(S:J2:J1:imm6:imm11:'0') */ - upper = TO_NATIVE(*(uint16_t *)loc); - lower = TO_NATIVE(*((uint16_t *)loc + 1)); + upper = get_unaligned_native((uint16_t *)loc); + lower = get_unaligned_native((uint16_t *)loc + 1); sign = (upper >> 10) & 1; j1 = (lower >> 13) & 1; @@ -1219,8 +1219,8 @@ static Elf_Addr addend_arm_rel(void *loc, Elf_Sym *sym, unsigned int r_type) * I2 = NOT(J2 XOR S) * imm32 = SignExtend(S:I1:I2:imm10:imm11:'0') */ - upper = TO_NATIVE(*(uint16_t *)loc); - lower = TO_NATIVE(*((uint16_t *)loc + 1)); + upper = get_unaligned_native((uint16_t *)loc); + lower = get_unaligned_native((uint16_t *)loc + 1); sign = (upper >> 10) & 1; j1 = (lower >> 13) & 1; @@ -1241,7 +1241,7 @@ static Elf_Addr addend_mips_rel(uint32_t *location, unsigned int r_type) { uint32_t inst; - inst = TO_NATIVE(*location); + inst = get_unaligned_native(location); switch (r_type) { case R_MIPS_LO16: return inst & 0xffff; diff --git a/scripts/mod/modpost.h b/scripts/mod/modpost.h index 8b72c227ebf4..ffd0a52a606e 100644 --- a/scripts/mod/modpost.h +++ b/scripts/mod/modpost.h @@ -65,6 +65,20 @@ #define TO_NATIVE(x) \ (target_is_big_endian == host_is_big_endian ? 
x : bswap(x)) +#define __get_unaligned_t(type, ptr) ({ \ + const struct { type x; } __attribute__((__packed__)) *__pptr = \ + (typeof(__pptr))(ptr); \ + __pptr->x; \ +}) + +#define get_unaligned(ptr) __get_unaligned_t(typeof(*(ptr)), (ptr)) + +#define get_unaligned_native(ptr) \ +({ \ + typeof(*(ptr)) _val = get_unaligned(ptr); \ + TO_NATIVE(_val); \ +}) + #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0])) #define strstarts(str, prefix) (strncmp(str, prefix, strlen(prefix)) == 0) diff --git a/scripts/package/PKGBUILD b/scripts/package/PKGBUILD index f83493838cf9..dca706617adc 100644 --- a/scripts/package/PKGBUILD +++ b/scripts/package/PKGBUILD @@ -103,7 +103,7 @@ _package-headers() { _package-api-headers() { pkgdesc="Kernel headers sanitized for use in userspace" - provides=(linux-api-headers) + provides=(linux-api-headers="${pkgver}") conflicts=(linux-api-headers) _prologue diff --git a/scripts/sorttable.h b/scripts/sorttable.h index 7bd0184380d3..a7c5445baf00 100644 --- a/scripts/sorttable.h +++ b/scripts/sorttable.h @@ -110,7 +110,7 @@ static inline unsigned long orc_ip(const int *ip) static int orc_sort_cmp(const void *_a, const void *_b) { - struct orc_entry *orc_a; + struct orc_entry *orc_a, *orc_b; const int *a = g_orc_ip_table + *(int *)_a; const int *b = g_orc_ip_table + *(int *)_b; unsigned long a_val = orc_ip(a); @@ -128,6 +128,9 @@ static int orc_sort_cmp(const void *_a, const void *_b) * whitelisted .o files which didn't get objtool generation. */ orc_a = g_orc_table + (a - g_orc_ip_table); + orc_b = g_orc_table + (b - g_orc_ip_table); + if (orc_a->type == ORC_TYPE_UNDEFINED && orc_b->type == ORC_TYPE_UNDEFINED) + return 0; return orc_a->type == ORC_TYPE_UNDEFINED ? 
-1 : 1; } diff --git a/security/selinux/avc.c b/security/selinux/avc.c index cc0b0af20296..1f2680bcc43a 100644 --- a/security/selinux/avc.c +++ b/security/selinux/avc.c @@ -174,13 +174,15 @@ int avc_get_hash_stats(char *page) * using a linked list for extended_perms_decision lookup because the list is * always small. i.e. less than 5, typically 1 */ -static struct extended_perms_decision *avc_xperms_decision_lookup(u8 driver, - struct avc_xperms_node *xp_node) +static struct extended_perms_decision * +avc_xperms_decision_lookup(u8 driver, u8 base_perm, + struct avc_xperms_node *xp_node) { struct avc_xperms_decision_node *xpd_node; list_for_each_entry(xpd_node, &xp_node->xpd_head, xpd_list) { - if (xpd_node->xpd.driver == driver) + if (xpd_node->xpd.driver == driver && + xpd_node->xpd.base_perm == base_perm) return &xpd_node->xpd; } return NULL; @@ -205,11 +207,12 @@ avc_xperms_has_perm(struct extended_perms_decision *xpd, } static void avc_xperms_allow_perm(struct avc_xperms_node *xp_node, - u8 driver, u8 perm) + u8 driver, u8 base_perm, u8 perm) { struct extended_perms_decision *xpd; security_xperm_set(xp_node->xp.drivers.p, driver); - xpd = avc_xperms_decision_lookup(driver, xp_node); + xp_node->xp.base_perms |= base_perm; + xpd = avc_xperms_decision_lookup(driver, base_perm, xp_node); if (xpd && xpd->allowed) security_xperm_set(xpd->allowed->p, perm); } @@ -245,6 +248,7 @@ static void avc_xperms_free(struct avc_xperms_node *xp_node) static void avc_copy_xperms_decision(struct extended_perms_decision *dest, struct extended_perms_decision *src) { + dest->base_perm = src->base_perm; dest->driver = src->driver; dest->used = src->used; if (dest->used & XPERMS_ALLOWED) @@ -272,6 +276,7 @@ static inline void avc_quick_copy_xperms_decision(u8 perm, */ u8 i = perm >> 5; + dest->base_perm = src->base_perm; dest->used = src->used; if (dest->used & XPERMS_ALLOWED) dest->allowed->p[i] = src->allowed->p[i]; @@ -357,6 +362,7 @@ static int avc_xperms_populate(struct avc_node 
*node, memcpy(dest->xp.drivers.p, src->xp.drivers.p, sizeof(dest->xp.drivers.p)); dest->xp.len = src->xp.len; + dest->xp.base_perms = src->xp.base_perms; /* for each source xpd allocate a destination xpd and copy */ list_for_each_entry(src_xpd, &src->xpd_head, xpd_list) { @@ -807,6 +813,7 @@ out: * @event : Updating event * @perms : Permission mask bits * @driver: xperm driver information + * @base_perm: the base permission associated with the extended permission * @xperm: xperm permissions * @ssid: AVC entry source sid * @tsid: AVC entry target sid @@ -820,10 +827,9 @@ out: * otherwise, this function updates the AVC entry. The original AVC-entry object * will release later by RCU. */ -static int avc_update_node(u32 event, u32 perms, u8 driver, u8 xperm, u32 ssid, - u32 tsid, u16 tclass, u32 seqno, - struct extended_perms_decision *xpd, - u32 flags) +static int avc_update_node(u32 event, u32 perms, u8 driver, u8 base_perm, + u8 xperm, u32 ssid, u32 tsid, u16 tclass, u32 seqno, + struct extended_perms_decision *xpd, u32 flags) { u32 hvalue; int rc = 0; @@ -880,7 +886,7 @@ static int avc_update_node(u32 event, u32 perms, u8 driver, u8 xperm, u32 ssid, case AVC_CALLBACK_GRANT: node->ae.avd.allowed |= perms; if (node->ae.xp_node && (flags & AVC_EXTENDED_PERMS)) - avc_xperms_allow_perm(node->ae.xp_node, driver, xperm); + avc_xperms_allow_perm(node->ae.xp_node, driver, base_perm, xperm); break; case AVC_CALLBACK_TRY_REVOKE: case AVC_CALLBACK_REVOKE: @@ -987,10 +993,9 @@ static noinline void avc_compute_av(u32 ssid, u32 tsid, u16 tclass, avc_insert(ssid, tsid, tclass, avd, xp_node); } -static noinline int avc_denied(u32 ssid, u32 tsid, - u16 tclass, u32 requested, - u8 driver, u8 xperm, unsigned int flags, - struct av_decision *avd) +static noinline int avc_denied(u32 ssid, u32 tsid, u16 tclass, u32 requested, + u8 driver, u8 base_perm, u8 xperm, + unsigned int flags, struct av_decision *avd) { if (flags & AVC_STRICT) return -EACCES; @@ -999,7 +1004,7 @@ static noinline 
int avc_denied(u32 ssid, u32 tsid, !(avd->flags & AVD_FLAGS_PERMISSIVE)) return -EACCES; - avc_update_node(AVC_CALLBACK_GRANT, requested, driver, + avc_update_node(AVC_CALLBACK_GRANT, requested, driver, base_perm, xperm, ssid, tsid, tclass, avd->seqno, NULL, flags); return 0; } @@ -1012,7 +1017,8 @@ static noinline int avc_denied(u32 ssid, u32 tsid, * driver field is used to specify which set contains the permission. */ int avc_has_extended_perms(u32 ssid, u32 tsid, u16 tclass, u32 requested, - u8 driver, u8 xperm, struct common_audit_data *ad) + u8 driver, u8 base_perm, u8 xperm, + struct common_audit_data *ad) { struct avc_node *node; struct av_decision avd; @@ -1047,22 +1053,23 @@ int avc_has_extended_perms(u32 ssid, u32 tsid, u16 tclass, u32 requested, local_xpd.auditallow = &auditallow; local_xpd.dontaudit = &dontaudit; - xpd = avc_xperms_decision_lookup(driver, xp_node); + xpd = avc_xperms_decision_lookup(driver, base_perm, xp_node); if (unlikely(!xpd)) { /* * Compute the extended_perms_decision only if the driver - * is flagged + * is flagged and the base permission is known. 
*/ - if (!security_xperm_test(xp_node->xp.drivers.p, driver)) { + if (!security_xperm_test(xp_node->xp.drivers.p, driver) || + !(xp_node->xp.base_perms & base_perm)) { avd.allowed &= ~requested; goto decision; } rcu_read_unlock(); - security_compute_xperms_decision(ssid, tsid, tclass, - driver, &local_xpd); + security_compute_xperms_decision(ssid, tsid, tclass, driver, + base_perm, &local_xpd); rcu_read_lock(); - avc_update_node(AVC_CALLBACK_ADD_XPERMS, requested, - driver, xperm, ssid, tsid, tclass, avd.seqno, + avc_update_node(AVC_CALLBACK_ADD_XPERMS, requested, driver, + base_perm, xperm, ssid, tsid, tclass, avd.seqno, &local_xpd, 0); } else { avc_quick_copy_xperms_decision(xperm, &local_xpd, xpd); @@ -1075,8 +1082,8 @@ int avc_has_extended_perms(u32 ssid, u32 tsid, u16 tclass, u32 requested, decision: denied = requested & ~(avd.allowed); if (unlikely(denied)) - rc = avc_denied(ssid, tsid, tclass, requested, - driver, xperm, AVC_EXTENDED_PERMS, &avd); + rc = avc_denied(ssid, tsid, tclass, requested, driver, + base_perm, xperm, AVC_EXTENDED_PERMS, &avd); rcu_read_unlock(); @@ -1110,7 +1117,7 @@ static noinline int avc_perm_nonode(u32 ssid, u32 tsid, u16 tclass, avc_compute_av(ssid, tsid, tclass, avd, &xp_node); denied = requested & ~(avd->allowed); if (unlikely(denied)) - return avc_denied(ssid, tsid, tclass, requested, 0, 0, + return avc_denied(ssid, tsid, tclass, requested, 0, 0, 0, flags, avd); return 0; } @@ -1158,7 +1165,7 @@ inline int avc_has_perm_noaudit(u32 ssid, u32 tsid, rcu_read_unlock(); if (unlikely(denied)) - return avc_denied(ssid, tsid, tclass, requested, 0, 0, + return avc_denied(ssid, tsid, tclass, requested, 0, 0, 0, flags, avd); return 0; } diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 366c87a40bd1..171dd7fceac5 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -3688,8 +3688,8 @@ static int ioctl_has_perm(const struct cred *cred, struct file *file, return 0; isec = inode_security(inode); - rc = 
avc_has_extended_perms(ssid, isec->sid, isec->sclass, - requested, driver, xperm, &ad); + rc = avc_has_extended_perms(ssid, isec->sid, isec->sclass, requested, + driver, AVC_EXT_IOCTL, xperm, &ad); out: return rc; } @@ -5952,7 +5952,7 @@ static int nlmsg_sock_has_extended_perms(struct sock *sk, u32 perms, u16 nlmsg_t xperm = nlmsg_type & 0xff; return avc_has_extended_perms(current_sid(), sksec->sid, sksec->sclass, - perms, driver, xperm, &ad); + perms, driver, AVC_EXT_NLMSG, xperm, &ad); } static int selinux_netlink_send(struct sock *sk, struct sk_buff *skb) diff --git a/security/selinux/include/avc.h b/security/selinux/include/avc.h index 96a614d47df8..281f40103663 100644 --- a/security/selinux/include/avc.h +++ b/security/selinux/include/avc.h @@ -136,8 +136,11 @@ int avc_has_perm_noaudit(u32 ssid, u32 tsid, u16 tclass, u32 requested, int avc_has_perm(u32 ssid, u32 tsid, u16 tclass, u32 requested, struct common_audit_data *auditdata); +#define AVC_EXT_IOCTL (1 << 0) /* Cache entry for an ioctl extended permission */ +#define AVC_EXT_NLMSG (1 << 1) /* Cache entry for an nlmsg extended permission */ int avc_has_extended_perms(u32 ssid, u32 tsid, u16 tclass, u32 requested, - u8 driver, u8 perm, struct common_audit_data *ad); + u8 driver, u8 base_perm, u8 perm, + struct common_audit_data *ad); u32 avc_policy_seqno(void); diff --git a/security/selinux/include/security.h b/security/selinux/include/security.h index c7f2731abd03..700bd6c8bb38 100644 --- a/security/selinux/include/security.h +++ b/security/selinux/include/security.h @@ -239,6 +239,7 @@ struct extended_perms_data { struct extended_perms_decision { u8 used; u8 driver; + u8 base_perm; struct extended_perms_data *allowed; struct extended_perms_data *auditallow; struct extended_perms_data *dontaudit; @@ -246,6 +247,7 @@ struct extended_perms_decision { struct extended_perms { u16 len; /* length associated decision chain */ + u8 base_perms; /* which base permissions are covered */ struct extended_perms_data 
drivers; /* flag drivers that are used */ }; @@ -257,6 +259,7 @@ void security_compute_av(u32 ssid, u32 tsid, u16 tclass, struct extended_perms *xperms); void security_compute_xperms_decision(u32 ssid, u32 tsid, u16 tclass, u8 driver, + u8 base_perm, struct extended_perms_decision *xpermd); void security_compute_av_user(u32 ssid, u32 tsid, u16 tclass, diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c index 3d5c563cfc4c..d9f58b5d0f49 100644 --- a/security/selinux/ss/services.c +++ b/security/selinux/ss/services.c @@ -582,7 +582,7 @@ static void type_attribute_bounds_av(struct policydb *policydb, } /* - * Flag which drivers have permissions. + * Flag which drivers have permissions and which base permissions are covered. */ void services_compute_xperms_drivers( struct extended_perms *xperms, @@ -592,12 +592,19 @@ void services_compute_xperms_drivers( switch (node->datum.u.xperms->specified) { case AVTAB_XPERMS_IOCTLDRIVER: + xperms->base_perms |= AVC_EXT_IOCTL; /* if one or more driver has all permissions allowed */ for (i = 0; i < ARRAY_SIZE(xperms->drivers.p); i++) xperms->drivers.p[i] |= node->datum.u.xperms->perms.p[i]; break; case AVTAB_XPERMS_IOCTLFUNCTION: + xperms->base_perms |= AVC_EXT_IOCTL; + /* if allowing permissions within a driver */ + security_xperm_set(xperms->drivers.p, + node->datum.u.xperms->driver); + break; case AVTAB_XPERMS_NLMSG: + xperms->base_perms |= AVC_EXT_NLMSG; /* if allowing permissions within a driver */ security_xperm_set(xperms->drivers.p, node->datum.u.xperms->driver); @@ -631,8 +638,7 @@ static void context_struct_compute_av(struct policydb *policydb, avd->auditallow = 0; avd->auditdeny = 0xffffffff; if (xperms) { - memset(&xperms->drivers, 0, sizeof(xperms->drivers)); - xperms->len = 0; + memset(xperms, 0, sizeof(*xperms)); } if (unlikely(!tclass || tclass > policydb->p_classes.nprim)) { @@ -969,13 +975,19 @@ void services_compute_xperms_decision(struct extended_perms_decision *xpermd, { switch 
(node->datum.u.xperms->specified) { case AVTAB_XPERMS_IOCTLFUNCTION: - case AVTAB_XPERMS_NLMSG: - if (xpermd->driver != node->datum.u.xperms->driver) + if (xpermd->base_perm != AVC_EXT_IOCTL || + xpermd->driver != node->datum.u.xperms->driver) return; break; case AVTAB_XPERMS_IOCTLDRIVER: - if (!security_xperm_test(node->datum.u.xperms->perms.p, - xpermd->driver)) + if (xpermd->base_perm != AVC_EXT_IOCTL || + !security_xperm_test(node->datum.u.xperms->perms.p, + xpermd->driver)) + return; + break; + case AVTAB_XPERMS_NLMSG: + if (xpermd->base_perm != AVC_EXT_NLMSG || + xpermd->driver != node->datum.u.xperms->driver) return; break; default: @@ -1010,6 +1022,7 @@ void security_compute_xperms_decision(u32 ssid, u32 tsid, u16 orig_tclass, u8 driver, + u8 base_perm, struct extended_perms_decision *xpermd) { struct selinux_policy *policy; @@ -1023,6 +1036,7 @@ void security_compute_xperms_decision(u32 ssid, struct ebitmap_node *snode, *tnode; unsigned int i, j; + xpermd->base_perm = base_perm; xpermd->driver = driver; xpermd->used = 0; memset(xpermd->allowed->p, 0, sizeof(xpermd->allowed->p)); diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h index 8516c1ccd57a..7e46ca4cd31b 100644 --- a/tools/include/uapi/linux/if_link.h +++ b/tools/include/uapi/linux/if_link.h @@ -1315,6 +1315,8 @@ enum { IFLA_NETKIT_MODE, IFLA_NETKIT_SCRUB, IFLA_NETKIT_PEER_SCRUB, + IFLA_NETKIT_HEADROOM, + IFLA_NETKIT_TAILROOM, __IFLA_NETKIT_MAX, }; #define IFLA_NETKIT_MAX (__IFLA_NETKIT_MAX - 1) diff --git a/tools/net/ynl/Makefile b/tools/net/ynl/Makefile index d1cdf2a8f826..211df5a93ad9 100644 --- a/tools/net/ynl/Makefile +++ b/tools/net/ynl/Makefile @@ -1,5 +1,17 @@ # SPDX-License-Identifier: GPL-2.0 +include ../../scripts/Makefile.arch + +INSTALL ?= install +prefix ?= /usr +ifeq ($(LP64), 1) + libdir_relative = lib64 +else + libdir_relative = lib +endif +libdir ?= $(prefix)/$(libdir_relative) +includedir ?= $(prefix)/include + SUBDIRS = lib generated samples all: 
$(SUBDIRS) libynl.a @@ -21,5 +33,20 @@ clean distclean: fi \ done rm -f libynl.a + rm -rf pyynl/__pycache__ + rm -rf pyynl/lib/__pycache__ + rm -rf pyynl.egg-info + rm -rf build + +install: libynl.a lib/*.h + @echo -e "\tINSTALL libynl.a" + @$(INSTALL) -d $(DESTDIR)$(libdir) + @$(INSTALL) -m 0644 libynl.a $(DESTDIR)$(libdir)/libynl.a + @echo -e "\tINSTALL libynl headers" + @$(INSTALL) -d $(DESTDIR)$(includedir)/ynl + @$(INSTALL) -m 0644 lib/*.h $(DESTDIR)$(includedir)/ynl/ + @echo -e "\tINSTALL pyynl" + @pip install --prefix=$(DESTDIR)$(prefix) . + @make -C generated install -.PHONY: all clean distclean $(SUBDIRS) +.PHONY: all clean distclean install $(SUBDIRS) diff --git a/tools/net/ynl/generated/.gitignore b/tools/net/ynl/generated/.gitignore index ade488626d26..859a6fb446e1 100644 --- a/tools/net/ynl/generated/.gitignore +++ b/tools/net/ynl/generated/.gitignore @@ -1,2 +1,3 @@ *-user.c *-user.h +*.rst diff --git a/tools/net/ynl/generated/Makefile b/tools/net/ynl/generated/Makefile index 7db5240de58a..21f9e299dc75 100644 --- a/tools/net/ynl/generated/Makefile +++ b/tools/net/ynl/generated/Makefile @@ -7,32 +7,44 @@ ifeq ("$(DEBUG)","1") CFLAGS += -g -fsanitize=address -fsanitize=leak -static-libasan endif +INSTALL ?= install +prefix ?= /usr +datarootdir ?= $(prefix)/share +docdir ?= $(datarootdir)/doc +includedir ?= $(prefix)/include + include ../Makefile.deps YNL_GEN_ARG_ethtool:=--user-header linux/ethtool_netlink.h \ --exclude-op stats-get -TOOL:=../ynl-gen-c.py +TOOL:=../pyynl/ynl_gen_c.py +TOOL_RST:=../pyynl/ynl_gen_rst.py +SPECS_DIR:=../../../../Documentation/netlink/specs GENS_PATHS=$(shell grep -nrI --files-without-match \ 'protocol: netlink' \ - ../../../../Documentation/netlink/specs/) -GENS=$(patsubst ../../../../Documentation/netlink/specs/%.yaml,%,${GENS_PATHS}) + $(SPECS_DIR)) +GENS=$(patsubst $(SPECS_DIR)/%.yaml,%,${GENS_PATHS}) SRCS=$(patsubst %,%-user.c,${GENS}) HDRS=$(patsubst %,%-user.h,${GENS}) OBJS=$(patsubst %,%-user.o,${GENS}) -all: 
protos.a $(HDRS) $(SRCS) $(KHDRS) $(KSRCS) $(UAPI) +SPECS_PATHS=$(wildcard $(SPECS_DIR)/*.yaml) +SPECS=$(patsubst $(SPECS_DIR)/%.yaml,%,${SPECS_PATHS}) +RSTS=$(patsubst %,%.rst,${SPECS}) + +all: protos.a $(HDRS) $(SRCS) $(KHDRS) $(KSRCS) $(UAPI) $(RSTS) protos.a: $(OBJS) @echo -e "\tAR $@" @ar rcs $@ $(OBJS) -%-user.h: ../../../../Documentation/netlink/specs/%.yaml $(TOOL) +%-user.h: $(SPECS_DIR)/%.yaml $(TOOL) @echo -e "\tGEN $@" @$(TOOL) --mode user --header --spec $< -o $@ $(YNL_GEN_ARG_$*) -%-user.c: ../../../../Documentation/netlink/specs/%.yaml $(TOOL) +%-user.c: $(SPECS_DIR)/%.yaml $(TOOL) @echo -e "\tGEN $@" @$(TOOL) --mode user --source --spec $< -o $@ $(YNL_GEN_ARG_$*) @@ -40,14 +52,37 @@ protos.a: $(OBJS) @echo -e "\tCC $@" @$(COMPILE.c) $(CFLAGS_$*) -o $@ $< +%.rst: $(SPECS_DIR)/%.yaml $(TOOL_RST) + @echo -e "\tGEN_RST $@" + @$(TOOL_RST) -o $@ -i $< + clean: rm -f *.o distclean: clean - rm -f *.c *.h *.a + rm -f *.c *.h *.a *.rst regen: @../ynl-regen.sh -.PHONY: all clean distclean regen +install-headers: $(HDRS) + @echo -e "\tINSTALL generated headers" + @$(INSTALL) -d $(DESTDIR)$(includedir)/ynl + @$(INSTALL) -m 0644 *.h $(DESTDIR)$(includedir)/ynl/ + +install-rsts: $(RSTS) + @echo -e "\tINSTALL generated docs" + @$(INSTALL) -d $(DESTDIR)$(docdir)/ynl + @$(INSTALL) -m 0644 $(RSTS) $(DESTDIR)$(docdir)/ynl/ + +install-specs: + @echo -e "\tINSTALL specs" + @$(INSTALL) -d $(DESTDIR)$(datarootdir)/ynl + @$(INSTALL) -m 0644 ../../../../Documentation/netlink/*.yaml $(DESTDIR)$(datarootdir)/ynl/ + @$(INSTALL) -d $(DESTDIR)$(datarootdir)/ynl/specs + @$(INSTALL) -m 0644 $(SPECS_DIR)/*.yaml $(DESTDIR)$(datarootdir)/ynl/specs/ + +install: install-headers install-rsts install-specs + +.PHONY: all clean distclean regen install install-headers install-rsts install-specs .DEFAULT_GOAL: all diff --git a/tools/net/ynl/lib/.gitignore b/tools/net/ynl/lib/.gitignore index 296c4035dbf2..a4383358ec72 100644 --- a/tools/net/ynl/lib/.gitignore +++ 
b/tools/net/ynl/lib/.gitignore @@ -1,2 +1 @@ -__pycache__/ *.d diff --git a/tools/net/ynl/lib/Makefile b/tools/net/ynl/lib/Makefile index 94c49cca3dca..4b2b98704ff9 100644 --- a/tools/net/ynl/lib/Makefile +++ b/tools/net/ynl/lib/Makefile @@ -19,7 +19,6 @@ ynl.a: $(OBJS) clean: rm -f *.o *.d *~ - rm -rf __pycache__ distclean: clean rm -f *.a diff --git a/tools/net/ynl/pyproject.toml b/tools/net/ynl/pyproject.toml new file mode 100644 index 000000000000..a81d8779b0e0 --- /dev/null +++ b/tools/net/ynl/pyproject.toml @@ -0,0 +1,24 @@ +[build-system] +requires = ["setuptools>=61.0"] +build-backend = "setuptools.build_meta" + +[project] +name = "pyynl" +authors = [ + {name = "Donald Hunter", email = "donald.hunter@gmail.com"}, + {name = "Jakub Kicinski", email = "kuba@kernel.org"}, +] +description = "yaml netlink (ynl)" +version = "0.0.1" +requires-python = ">=3.9" +dependencies = [ + "pyyaml==6.*", + "jsonschema==4.*" +] + +[tool.setuptools.packages.find] +include = ["pyynl", "pyynl.lib"] + +[project.scripts] +ynl = "pyynl.cli:main" +ynl-ethtool = "pyynl.ethtool:main" diff --git a/tools/net/ynl/pyynl/.gitignore b/tools/net/ynl/pyynl/.gitignore new file mode 100644 index 000000000000..b801cd2d016e --- /dev/null +++ b/tools/net/ynl/pyynl/.gitignore @@ -0,0 +1,2 @@ +__pycache__/ +lib/__pycache__/ diff --git a/tools/net/ynl/pyynl/__init__.py b/tools/net/ynl/pyynl/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 --- /dev/null +++ b/tools/net/ynl/pyynl/__init__.py diff --git a/tools/net/ynl/cli.py b/tools/net/ynl/pyynl/cli.py index 41d9fa5c818d..794e3c7dcc65 100755 --- a/tools/net/ynl/cli.py +++ b/tools/net/ynl/pyynl/cli.py @@ -3,6 +3,7 @@ import argparse import json +import os import pathlib import pprint import sys @@ -10,6 +11,24 @@ import sys sys.path.append(pathlib.Path(__file__).resolve().parent.as_posix()) from lib import YnlFamily, Netlink, NlError +sys_schema_dir='/usr/share/ynl' +relative_schema_dir='../../../../Documentation/netlink' + +def 
schema_dir(): + script_dir = os.path.dirname(os.path.abspath(__file__)) + schema_dir = os.path.abspath(f"{script_dir}/{relative_schema_dir}") + if not os.path.isdir(schema_dir): + schema_dir = sys_schema_dir + if not os.path.isdir(schema_dir): + raise Exception(f"Schema directory {schema_dir} does not exist") + return schema_dir + +def spec_dir(): + spec_dir = schema_dir() + '/specs' + if not os.path.isdir(spec_dir): + raise Exception(f"Spec directory {spec_dir} does not exist") + return spec_dir + class YnlEncoder(json.JSONEncoder): def default(self, obj): @@ -32,7 +51,14 @@ def main(): parser = argparse.ArgumentParser(description=description, epilog=epilog) - parser.add_argument('--spec', dest='spec', type=str, required=True) + spec_group = parser.add_mutually_exclusive_group(required=True) + spec_group.add_argument('--family', dest='family', type=str, + help='name of the netlink FAMILY') + spec_group.add_argument('--list-families', action='store_true', + help='list all netlink families supported by YNL (has spec)') + spec_group.add_argument('--spec', dest='spec', type=str, + help='choose the family by SPEC file path') + parser.add_argument('--schema', dest='schema', type=str) parser.add_argument('--no-schema', action='store_true') parser.add_argument('--json', dest='json_text', type=str) @@ -70,6 +96,12 @@ def main(): else: pprint.PrettyPrinter().pprint(msg) + if args.list_families: + for filename in sorted(os.listdir(spec_dir())): + if filename.endswith('.yaml'): + print(filename.removesuffix('.yaml')) + return + if args.no_schema: args.schema = '' @@ -77,7 +109,16 @@ def main(): if args.json_text: attrs = json.loads(args.json_text) - ynl = YnlFamily(args.spec, args.schema, args.process_unknown, + if args.family: + spec = f"{spec_dir()}/{args.family}.yaml" + if args.schema is None and spec.startswith(sys_schema_dir): + args.schema = '' # disable schema validation when installed + else: + spec = args.spec + if not os.path.isfile(spec): + raise Exception(f"Spec 
file {spec} does not exist") + + ynl = YnlFamily(spec, args.schema, args.process_unknown, recv_size=args.dbg_small_recv) if args.dbg_small_recv: ynl.set_recv_dbg(True) diff --git a/tools/net/ynl/ethtool.py b/tools/net/ynl/pyynl/ethtool.py index ebb0a11f67bf..af7fddd7b085 100755 --- a/tools/net/ynl/ethtool.py +++ b/tools/net/ynl/pyynl/ethtool.py @@ -11,6 +11,7 @@ import os sys.path.append(pathlib.Path(__file__).resolve().parent.as_posix()) from lib import YnlFamily +from cli import schema_dir, spec_dir def args_to_req(ynl, op_name, args, req): """ @@ -156,10 +157,8 @@ def main(): args = parser.parse_args() script_abs_dir = os.path.dirname(os.path.abspath(sys.argv[0])) - spec = os.path.join(script_abs_dir, - '../../../Documentation/netlink/specs/ethtool.yaml') - schema = os.path.join(script_abs_dir, - '../../../Documentation/netlink/genetlink-legacy.yaml') + spec = os.path.join(spec_dir(), 'ethtool.yaml') + schema = os.path.join(schema_dir(), 'genetlink-legacy.yaml') ynl = YnlFamily(spec, schema) diff --git a/tools/net/ynl/lib/__init__.py b/tools/net/ynl/pyynl/lib/__init__.py index 9137b83e580a..9137b83e580a 100644 --- a/tools/net/ynl/lib/__init__.py +++ b/tools/net/ynl/pyynl/lib/__init__.py diff --git a/tools/net/ynl/lib/nlspec.py b/tools/net/ynl/pyynl/lib/nlspec.py index a745739655ad..314ec8007496 100644 --- a/tools/net/ynl/lib/nlspec.py +++ b/tools/net/ynl/pyynl/lib/nlspec.py @@ -219,7 +219,10 @@ class SpecAttrSet(SpecElement): else: real_set = family.attr_sets[self.subset_of] for elem in self.yaml['attributes']: - attr = real_set[elem['name']] + real_attr = real_set[elem['name']] + combined_elem = real_attr.yaml | elem + attr = self.new_attr(combined_elem, real_attr.value) + self.attrs[attr.name] = attr self.attrs_by_val[attr.value] = attr diff --git a/tools/net/ynl/lib/ynl.py b/tools/net/ynl/pyynl/lib/ynl.py index eea29359a899..08f8bf89cfc2 100644 --- a/tools/net/ynl/lib/ynl.py +++ b/tools/net/ynl/pyynl/lib/ynl.py @@ -733,41 +733,45 @@ class 
YnlFamily(SpecFamily): self._rsp_add(rsp, attr_name, None, self._decode_unknown(attr)) continue - if attr_spec["type"] == 'nest': - subdict = self._decode(NlAttrs(attr.raw), attr_spec['nested-attributes'], search_attrs) - decoded = subdict - elif attr_spec["type"] == 'string': - decoded = attr.as_strz() - elif attr_spec["type"] == 'binary': - decoded = self._decode_binary(attr, attr_spec) - elif attr_spec["type"] == 'flag': - decoded = True - elif attr_spec.is_auto_scalar: - decoded = attr.as_auto_scalar(attr_spec['type'], attr_spec.byte_order) - elif attr_spec["type"] in NlAttr.type_formats: - decoded = attr.as_scalar(attr_spec['type'], attr_spec.byte_order) - if 'enum' in attr_spec: - decoded = self._decode_enum(decoded, attr_spec) - elif attr_spec.display_hint: - decoded = self._formatted_string(decoded, attr_spec.display_hint) - elif attr_spec["type"] == 'indexed-array': - decoded = self._decode_array_attr(attr, attr_spec) - elif attr_spec["type"] == 'bitfield32': - value, selector = struct.unpack("II", attr.raw) - if 'enum' in attr_spec: - value = self._decode_enum(value, attr_spec) - selector = self._decode_enum(selector, attr_spec) - decoded = {"value": value, "selector": selector} - elif attr_spec["type"] == 'sub-message': - decoded = self._decode_sub_msg(attr, attr_spec, search_attrs) - elif attr_spec["type"] == 'nest-type-value': - decoded = self._decode_nest_type_value(attr, attr_spec) - else: - if not self.process_unknown: - raise Exception(f'Unknown {attr_spec["type"]} with name {attr_spec["name"]}') - decoded = self._decode_unknown(attr) - - self._rsp_add(rsp, attr_spec["name"], attr_spec.is_multi, decoded) + try: + if attr_spec["type"] == 'nest': + subdict = self._decode(NlAttrs(attr.raw), attr_spec['nested-attributes'], search_attrs) + decoded = subdict + elif attr_spec["type"] == 'string': + decoded = attr.as_strz() + elif attr_spec["type"] == 'binary': + decoded = self._decode_binary(attr, attr_spec) + elif attr_spec["type"] == 'flag': + decoded = 
True + elif attr_spec.is_auto_scalar: + decoded = attr.as_auto_scalar(attr_spec['type'], attr_spec.byte_order) + elif attr_spec["type"] in NlAttr.type_formats: + decoded = attr.as_scalar(attr_spec['type'], attr_spec.byte_order) + if 'enum' in attr_spec: + decoded = self._decode_enum(decoded, attr_spec) + elif attr_spec.display_hint: + decoded = self._formatted_string(decoded, attr_spec.display_hint) + elif attr_spec["type"] == 'indexed-array': + decoded = self._decode_array_attr(attr, attr_spec) + elif attr_spec["type"] == 'bitfield32': + value, selector = struct.unpack("II", attr.raw) + if 'enum' in attr_spec: + value = self._decode_enum(value, attr_spec) + selector = self._decode_enum(selector, attr_spec) + decoded = {"value": value, "selector": selector} + elif attr_spec["type"] == 'sub-message': + decoded = self._decode_sub_msg(attr, attr_spec, search_attrs) + elif attr_spec["type"] == 'nest-type-value': + decoded = self._decode_nest_type_value(attr, attr_spec) + else: + if not self.process_unknown: + raise Exception(f'Unknown {attr_spec["type"]} with name {attr_spec["name"]}') + decoded = self._decode_unknown(attr) + + self._rsp_add(rsp, attr_spec["name"], attr_spec.is_multi, decoded) + except: + print(f"Error decoding '{attr_spec.name}' from '{space}'") + raise return rsp diff --git a/tools/net/ynl/ynl-gen-c.py b/tools/net/ynl/pyynl/ynl_gen_c.py index ec2288948795..d3a7dfbcf929 100755 --- a/tools/net/ynl/ynl-gen-c.py +++ b/tools/net/ynl/pyynl/ynl_gen_c.py @@ -79,6 +79,20 @@ class Type(SpecAttr): self.enum_name = None delattr(self, "enum_name") + def _get_real_attr(self): + # if the attr is for a subset return the "real" attr (just one down, does not recurse) + return self.family.attr_sets[self.attr_set.subset_of][self.name] + + def set_request(self): + self.request = True + if self.attr_set.subset_of: + self._get_real_attr().set_request() + + def set_reply(self): + self.reply = True + if self.attr_set.subset_of: + self._get_real_attr().set_reply() + def 
get_limit(self, limit, default=None): value = self.checks.get(limit, default) if value is None: @@ -106,6 +120,10 @@ class Type(SpecAttr): enum_name = f"{self.attr_set.name_prefix}{self.name}" self.enum_name = c_upper(enum_name) + if self.attr_set.subset_of: + if self.checks != self._get_real_attr().checks: + raise Exception("Overriding checks not supported by codegen, yet") + def is_multi_val(self): return None @@ -1119,17 +1137,17 @@ class Family(SpecFamily): for _, struct in self.pure_nested_structs.items(): if struct.request: for _, arg in struct.member_list(): - arg.request = True + arg.set_request() if struct.reply: for _, arg in struct.member_list(): - arg.reply = True + arg.set_reply() for root_set, rs_members in self.root_sets.items(): for attr, spec in self.attr_sets[root_set].items(): if attr in rs_members['request']: - spec.request = True + spec.set_request() if attr in rs_members['reply']: - spec.reply = True + spec.set_reply() def _load_global_policy(self): global_set = set() @@ -1765,7 +1783,14 @@ def parse_rsp_nested(ri, struct): f'{struct.ptr_name}dst = yarg->data;'] init_lines = [] - _multi_parse(ri, struct, init_lines, local_vars) + if struct.member_list(): + _multi_parse(ri, struct, init_lines, local_vars) + else: + # Empty nest + ri.cw.block_start() + ri.cw.p('return 0;') + ri.cw.block_end() + ri.cw.nl() def parse_rsp_msg(ri, deref=False): @@ -2592,7 +2617,8 @@ def render_uapi(family, cw): val = attr.value val += 1 cw.p(attr.enum_name + suffix) - cw.nl() + if attr_set.items(): + cw.nl() cw.p(attr_set.cnt_name + ('' if max_by_define else ',')) if not max_by_define: cw.p(f"{attr_set.max_name} = {max_value}") diff --git a/tools/net/ynl/ynl-gen-rst.py b/tools/net/ynl/pyynl/ynl_gen_rst.py index 6c56d0d726b4..6c56d0d726b4 100755 --- a/tools/net/ynl/ynl-gen-rst.py +++ b/tools/net/ynl/pyynl/ynl_gen_rst.py diff --git a/tools/net/ynl/ynl-regen.sh b/tools/net/ynl/ynl-regen.sh index a37304dcc88e..81b4ecd89100 100755 --- a/tools/net/ynl/ynl-regen.sh +++ 
b/tools/net/ynl/ynl-regen.sh @@ -1,7 +1,7 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause -TOOL=$(dirname $(realpath $0))/ynl-gen-c.py +TOOL=$(dirname $(realpath $0))/pyynl/ynl_gen_c.py force= search= diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h index 2f36b7b6418d..625f5b046776 100644 --- a/tools/sched_ext/include/scx/common.bpf.h +++ b/tools/sched_ext/include/scx/common.bpf.h @@ -40,9 +40,9 @@ void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_fl void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym __weak; u32 scx_bpf_dispatch_nr_slots(void) __ksym; void scx_bpf_dispatch_cancel(void) __ksym; -bool scx_bpf_dsq_move_to_local(u64 dsq_id) __ksym; -void scx_bpf_dsq_move_set_slice(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym; -void scx_bpf_dsq_move_set_vtime(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym; +bool scx_bpf_dsq_move_to_local(u64 dsq_id) __ksym __weak; +void scx_bpf_dsq_move_set_slice(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym __weak; +void scx_bpf_dsq_move_set_vtime(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym __weak; bool scx_bpf_dsq_move(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; bool scx_bpf_dsq_move_vtime(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; u32 scx_bpf_reenqueue_local(void) __ksym; diff --git a/tools/sched_ext/scx_central.c b/tools/sched_ext/scx_central.c index 21deea320bd7..e938156ed0a0 100644 --- a/tools/sched_ext/scx_central.c +++ b/tools/sched_ext/scx_central.c @@ -97,7 +97,7 @@ restart: SCX_BUG_ON(!cpuset, "Failed to allocate cpuset"); CPU_ZERO(cpuset); CPU_SET(skel->rodata->central_cpu, cpuset); - SCX_BUG_ON(sched_setaffinity(0, sizeof(cpuset), cpuset), + SCX_BUG_ON(sched_setaffinity(0, sizeof(*cpuset), cpuset), "Failed to affinitize to central CPU 
%d (max %d)", skel->rodata->central_cpu, skel->rodata->nr_cpu_ids - 1); CPU_FREE(cpuset); diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 7eeb3cbe18c7..0a016cd71cba 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -129,7 +129,6 @@ TEST_FILES = xsk_prereqs.sh $(wildcard progs/btf_dump_test_case_*.c) TEST_PROGS := test_kmod.sh \ test_xdp_redirect.sh \ test_xdp_redirect_multi.sh \ - test_xdp_meta.sh \ test_tunnel.sh \ test_lwt_seg6local.sh \ test_lirc_mode2.sh \ diff --git a/tools/testing/selftests/bpf/prog_tests/tc_netkit.c b/tools/testing/selftests/bpf/prog_tests/tc_netkit.c index 151a4210028f..2461d183dee5 100644 --- a/tools/testing/selftests/bpf/prog_tests/tc_netkit.c +++ b/tools/testing/selftests/bpf/prog_tests/tc_netkit.c @@ -14,10 +14,16 @@ #include "netlink_helpers.h" #include "tc_helpers.h" +#define NETKIT_HEADROOM 32 +#define NETKIT_TAILROOM 8 + #define MARK 42 #define PRIO 0xeb9f #define ICMP_ECHO 8 +#define FLAG_ADJUST_ROOM (1 << 0) +#define FLAG_SAME_NETNS (1 << 1) + struct icmphdr { __u8 type; __u8 code; @@ -35,7 +41,7 @@ struct iplink_req { }; static int create_netkit(int mode, int policy, int peer_policy, int *ifindex, - bool same_netns, int scrub, int peer_scrub) + int scrub, int peer_scrub, __u32 flags) { struct rtnl_handle rth = { .fd = -1 }; struct iplink_req req = {}; @@ -63,6 +69,10 @@ static int create_netkit(int mode, int policy, int peer_policy, int *ifindex, addattr32(&req.n, sizeof(req), IFLA_NETKIT_SCRUB, scrub); addattr32(&req.n, sizeof(req), IFLA_NETKIT_PEER_SCRUB, peer_scrub); addattr32(&req.n, sizeof(req), IFLA_NETKIT_MODE, mode); + if (flags & FLAG_ADJUST_ROOM) { + addattr16(&req.n, sizeof(req), IFLA_NETKIT_HEADROOM, NETKIT_HEADROOM); + addattr16(&req.n, sizeof(req), IFLA_NETKIT_TAILROOM, NETKIT_TAILROOM); + } addattr_nest_end(&req.n, data); addattr_nest_end(&req.n, linkinfo); @@ -87,7 +97,7 @@ static int create_netkit(int mode, int policy, 
int peer_policy, int *ifindex, " addr ee:ff:bb:cc:aa:dd"), "set hwaddress"); } - if (same_netns) { + if (flags & FLAG_SAME_NETNS) { ASSERT_OK(system("ip link set dev " netkit_peer " up"), "up peer"); ASSERT_OK(system("ip addr add dev " netkit_peer " 10.0.0.2/24"), @@ -184,8 +194,8 @@ void serial_test_tc_netkit_basic(void) int err, ifindex; err = create_netkit(NETKIT_L2, NETKIT_PASS, NETKIT_PASS, - &ifindex, false, NETKIT_SCRUB_DEFAULT, - NETKIT_SCRUB_DEFAULT); + &ifindex, NETKIT_SCRUB_DEFAULT, + NETKIT_SCRUB_DEFAULT, 0); if (err) return; @@ -299,8 +309,8 @@ static void serial_test_tc_netkit_multi_links_target(int mode, int target) int err, ifindex; err = create_netkit(mode, NETKIT_PASS, NETKIT_PASS, - &ifindex, false, NETKIT_SCRUB_DEFAULT, - NETKIT_SCRUB_DEFAULT); + &ifindex, NETKIT_SCRUB_DEFAULT, + NETKIT_SCRUB_DEFAULT, 0); if (err) return; @@ -428,8 +438,8 @@ static void serial_test_tc_netkit_multi_opts_target(int mode, int target) int err, ifindex; err = create_netkit(mode, NETKIT_PASS, NETKIT_PASS, - &ifindex, false, NETKIT_SCRUB_DEFAULT, - NETKIT_SCRUB_DEFAULT); + &ifindex, NETKIT_SCRUB_DEFAULT, + NETKIT_SCRUB_DEFAULT, 0); if (err) return; @@ -543,8 +553,8 @@ void serial_test_tc_netkit_device(void) int err, ifindex, ifindex2; err = create_netkit(NETKIT_L3, NETKIT_PASS, NETKIT_PASS, - &ifindex, true, NETKIT_SCRUB_DEFAULT, - NETKIT_SCRUB_DEFAULT); + &ifindex, NETKIT_SCRUB_DEFAULT, + NETKIT_SCRUB_DEFAULT, FLAG_SAME_NETNS); if (err) return; @@ -655,8 +665,8 @@ static void serial_test_tc_netkit_neigh_links_target(int mode, int target) int err, ifindex; err = create_netkit(mode, NETKIT_PASS, NETKIT_PASS, - &ifindex, false, NETKIT_SCRUB_DEFAULT, - NETKIT_SCRUB_DEFAULT); + &ifindex, NETKIT_SCRUB_DEFAULT, + NETKIT_SCRUB_DEFAULT, 0); if (err) return; @@ -733,8 +743,8 @@ static void serial_test_tc_netkit_pkt_type_mode(int mode) struct bpf_link *link; err = create_netkit(mode, NETKIT_PASS, NETKIT_PASS, - &ifindex, true, NETKIT_SCRUB_DEFAULT, - NETKIT_SCRUB_DEFAULT); + 
&ifindex, NETKIT_SCRUB_DEFAULT, + NETKIT_SCRUB_DEFAULT, FLAG_SAME_NETNS); if (err) return; @@ -799,7 +809,7 @@ void serial_test_tc_netkit_pkt_type(void) serial_test_tc_netkit_pkt_type_mode(NETKIT_L3); } -static void serial_test_tc_netkit_scrub_type(int scrub) +static void serial_test_tc_netkit_scrub_type(int scrub, bool room) { LIBBPF_OPTS(bpf_netkit_opts, optl); struct test_tc_link *skel; @@ -807,7 +817,8 @@ static void serial_test_tc_netkit_scrub_type(int scrub) int err, ifindex; err = create_netkit(NETKIT_L2, NETKIT_PASS, NETKIT_PASS, - &ifindex, false, scrub, scrub); + &ifindex, scrub, scrub, + room ? FLAG_ADJUST_ROOM : 0); if (err) return; @@ -842,6 +853,8 @@ static void serial_test_tc_netkit_scrub_type(int scrub) ASSERT_EQ(skel->bss->seen_tc8, true, "seen_tc8"); ASSERT_EQ(skel->bss->mark, scrub == NETKIT_SCRUB_NONE ? MARK : 0, "mark"); ASSERT_EQ(skel->bss->prio, scrub == NETKIT_SCRUB_NONE ? PRIO : 0, "prio"); + ASSERT_EQ(skel->bss->headroom, room ? NETKIT_HEADROOM : 0, "headroom"); + ASSERT_EQ(skel->bss->tailroom, room ? 
NETKIT_TAILROOM : 0, "tailroom"); cleanup: test_tc_link__destroy(skel); @@ -852,6 +865,6 @@ cleanup: void serial_test_tc_netkit_scrub(void) { - serial_test_tc_netkit_scrub_type(NETKIT_SCRUB_DEFAULT); - serial_test_tc_netkit_scrub_type(NETKIT_SCRUB_NONE); + serial_test_tc_netkit_scrub_type(NETKIT_SCRUB_DEFAULT, false); + serial_test_tc_netkit_scrub_type(NETKIT_SCRUB_NONE, true); } diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c b/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c index e6a783c7f5db..937da9b7532a 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c @@ -2,6 +2,14 @@ #include <test_progs.h> #include <network_helpers.h> #include "test_xdp_context_test_run.skel.h" +#include "test_xdp_meta.skel.h" + +#define TX_ADDR "10.0.0.1" +#define RX_ADDR "10.0.0.2" +#define RX_NAME "veth0" +#define TX_NAME "veth1" +#define TX_NETNS "xdp_context_tx" +#define RX_NETNS "xdp_context_rx" void test_xdp_context_error(int prog_fd, struct bpf_test_run_opts opts, __u32 data_meta, __u32 data, __u32 data_end, @@ -103,3 +111,82 @@ void test_xdp_context_test_run(void) test_xdp_context_test_run__destroy(skel); } + +void test_xdp_context_functional(void) +{ + LIBBPF_OPTS(bpf_tc_hook, tc_hook, .attach_point = BPF_TC_INGRESS); + LIBBPF_OPTS(bpf_tc_opts, tc_opts, .handle = 1, .priority = 1); + struct netns_obj *rx_ns = NULL, *tx_ns = NULL; + struct bpf_program *tc_prog, *xdp_prog; + struct test_xdp_meta *skel = NULL; + struct nstoken *nstoken = NULL; + int rx_ifindex; + int ret; + + tx_ns = netns_new(TX_NETNS, false); + if (!ASSERT_OK_PTR(tx_ns, "create tx_ns")) + return; + + rx_ns = netns_new(RX_NETNS, false); + if (!ASSERT_OK_PTR(rx_ns, "create rx_ns")) + goto close; + + SYS(close, "ip link add " RX_NAME " netns " RX_NETNS + " type veth peer name " TX_NAME " netns " TX_NETNS); + + nstoken = open_netns(RX_NETNS); + if (!ASSERT_OK_PTR(nstoken, "setns rx_ns")) 
+ goto close; + + SYS(close, "ip addr add " RX_ADDR "/24 dev " RX_NAME); + SYS(close, "ip link set dev " RX_NAME " up"); + + skel = test_xdp_meta__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open and load skeleton")) + goto close; + + rx_ifindex = if_nametoindex(RX_NAME); + if (!ASSERT_GE(rx_ifindex, 0, "if_nametoindex rx")) + goto close; + + tc_hook.ifindex = rx_ifindex; + ret = bpf_tc_hook_create(&tc_hook); + if (!ASSERT_OK(ret, "bpf_tc_hook_create")) + goto close; + + tc_prog = bpf_object__find_program_by_name(skel->obj, "ing_cls"); + if (!ASSERT_OK_PTR(tc_prog, "open ing_cls prog")) + goto close; + + tc_opts.prog_fd = bpf_program__fd(tc_prog); + ret = bpf_tc_attach(&tc_hook, &tc_opts); + if (!ASSERT_OK(ret, "bpf_tc_attach")) + goto close; + + xdp_prog = bpf_object__find_program_by_name(skel->obj, "ing_xdp"); + if (!ASSERT_OK_PTR(xdp_prog, "open ing_xdp prog")) + goto close; + + ret = bpf_xdp_attach(rx_ifindex, + bpf_program__fd(xdp_prog), + 0, NULL); + if (!ASSERT_GE(ret, 0, "bpf_xdp_attach")) + goto close; + + close_netns(nstoken); + + nstoken = open_netns(TX_NETNS); + if (!ASSERT_OK_PTR(nstoken, "setns tx_ns")) + goto close; + + SYS(close, "ip addr add " TX_ADDR "/24 dev " TX_NAME); + SYS(close, "ip link set dev " TX_NAME " up"); + ASSERT_OK(SYS_NOFAIL("ping -c 1 " RX_ADDR), "ping"); + +close: + close_netns(nstoken); + test_xdp_meta__destroy(skel); + netns_free(rx_ns); + netns_free(tx_ns); +} + diff --git a/tools/testing/selftests/bpf/progs/test_tc_link.c b/tools/testing/selftests/bpf/progs/test_tc_link.c index 10d825928499..630f12e51b07 100644 --- a/tools/testing/selftests/bpf/progs/test_tc_link.c +++ b/tools/testing/selftests/bpf/progs/test_tc_link.c @@ -8,6 +8,7 @@ #include <linux/if_packet.h> #include <bpf/bpf_endian.h> #include <bpf/bpf_helpers.h> +#include <bpf/bpf_core_read.h> char LICENSE[] SEC("license") = "GPL"; @@ -27,6 +28,7 @@ bool seen_host; bool seen_mcast; int mark, prio; +unsigned short headroom, tailroom; SEC("tc/ingress") int tc1(struct 
__sk_buff *skb) @@ -104,11 +106,24 @@ out: return TCX_PASS; } +struct sk_buff { + struct net_device *dev; +}; + +struct net_device { + unsigned short needed_headroom; + unsigned short needed_tailroom; +}; + SEC("tc/egress") int tc8(struct __sk_buff *skb) { + struct net_device *dev = BPF_CORE_READ((struct sk_buff *)skb, dev); + seen_tc8 = true; mark = skb->mark; prio = skb->priority; + headroom = BPF_CORE_READ(dev, needed_headroom); + tailroom = BPF_CORE_READ(dev, needed_tailroom); return TCX_PASS; } diff --git a/tools/testing/selftests/bpf/progs/test_xdp_meta.c b/tools/testing/selftests/bpf/progs/test_xdp_meta.c index a7c4a7d49fe6..fe2d71ae0e71 100644 --- a/tools/testing/selftests/bpf/progs/test_xdp_meta.c +++ b/tools/testing/selftests/bpf/progs/test_xdp_meta.c @@ -8,7 +8,7 @@ #define round_up(x, y) ((((x) - 1) | __round_mask(x, y)) + 1) #define ctx_ptr(ctx, mem) (void *)(unsigned long)ctx->mem -SEC("t") +SEC("tc") int ing_cls(struct __sk_buff *ctx) { __u8 *data, *data_meta, *data_end; @@ -28,7 +28,7 @@ int ing_cls(struct __sk_buff *ctx) return diff ? TC_ACT_SHOT : TC_ACT_OK; } -SEC("x") +SEC("xdp") int ing_xdp(struct xdp_md *ctx) { __u8 *data, *data_meta, *data_end; diff --git a/tools/testing/selftests/bpf/test_xdp_meta.sh b/tools/testing/selftests/bpf/test_xdp_meta.sh deleted file mode 100755 index 2740322c1878..000000000000 --- a/tools/testing/selftests/bpf/test_xdp_meta.sh +++ /dev/null @@ -1,58 +0,0 @@ -#!/bin/sh - -BPF_FILE="test_xdp_meta.bpf.o" -# Kselftest framework requirement - SKIP code is 4. -readonly KSFT_SKIP=4 -readonly NS1="ns1-$(mktemp -u XXXXXX)" -readonly NS2="ns2-$(mktemp -u XXXXXX)" - -cleanup() -{ - if [ "$?" = "0" ]; then - echo "selftests: test_xdp_meta [PASS]"; - else - echo "selftests: test_xdp_meta [FAILED]"; - fi - - set +e - ip link del veth1 2> /dev/null - ip netns del ${NS1} 2> /dev/null - ip netns del ${NS2} 2> /dev/null -} - -ip link set dev lo xdp off 2>/dev/null > /dev/null -if [ $? 
-ne 0 ];then - echo "selftests: [SKIP] Could not run test without the ip xdp support" - exit $KSFT_SKIP -fi -set -e - -ip netns add ${NS1} -ip netns add ${NS2} - -trap cleanup 0 2 3 6 9 - -ip link add veth1 type veth peer name veth2 - -ip link set veth1 netns ${NS1} -ip link set veth2 netns ${NS2} - -ip netns exec ${NS1} ip addr add 10.1.1.11/24 dev veth1 -ip netns exec ${NS2} ip addr add 10.1.1.22/24 dev veth2 - -ip netns exec ${NS1} tc qdisc add dev veth1 clsact -ip netns exec ${NS2} tc qdisc add dev veth2 clsact - -ip netns exec ${NS1} tc filter add dev veth1 ingress bpf da obj ${BPF_FILE} sec t -ip netns exec ${NS2} tc filter add dev veth2 ingress bpf da obj ${BPF_FILE} sec t - -ip netns exec ${NS1} ip link set dev veth1 xdp obj ${BPF_FILE} sec x -ip netns exec ${NS2} ip link set dev veth2 xdp obj ${BPF_FILE} sec x - -ip netns exec ${NS1} ip link set dev veth1 up -ip netns exec ${NS2} ip link set dev veth2 up - -ip netns exec ${NS1} ping -c 1 10.1.1.22 -ip netns exec ${NS2} ping -c 1 10.1.1.11 - -exit 0 diff --git a/tools/testing/selftests/bpf/xdp_hw_metadata.c b/tools/testing/selftests/bpf/xdp_hw_metadata.c index 6f9956eed797..e38675d9b118 100644 --- a/tools/testing/selftests/bpf/xdp_hw_metadata.c +++ b/tools/testing/selftests/bpf/xdp_hw_metadata.c @@ -79,7 +79,7 @@ static int open_xsk(int ifindex, struct xsk *xsk, __u32 queue_id) .fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS, .comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS, .frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE, - .flags = XSK_UMEM__DEFAULT_FLAGS, + .flags = XDP_UMEM_TX_METADATA_LEN, .tx_metadata_len = sizeof(struct xsk_tx_metadata), }; __u32 idx = 0; @@ -551,6 +551,7 @@ static void hwtstamp_enable(const char *ifname) { struct hwtstamp_config cfg = { .rx_filter = HWTSTAMP_FILTER_ALL, + .tx_type = HWTSTAMP_TX_ON, }; hwtstamp_ioctl(SIOCGHWTSTAMP, ifname, &saved_hwtstamp_cfg); diff --git a/tools/testing/selftests/drivers/net/Makefile b/tools/testing/selftests/drivers/net/Makefile index 
0fec8f9801ad..469179c18935 100644 --- a/tools/testing/selftests/drivers/net/Makefile +++ b/tools/testing/selftests/drivers/net/Makefile @@ -1,11 +1,13 @@ # SPDX-License-Identifier: GPL-2.0 TEST_INCLUDES := $(wildcard lib/py/*.py) \ + $(wildcard lib/sh/*.sh) \ ../../net/net_helper.sh \ ../../net/lib.sh \ TEST_PROGS := \ netcons_basic.sh \ + netcons_overflow.sh \ ping.py \ queues.py \ stats.py \ diff --git a/tools/testing/selftests/drivers/net/bonding/Makefile b/tools/testing/selftests/drivers/net/bonding/Makefile index 03a089165d3f..2b10854e4b1e 100644 --- a/tools/testing/selftests/drivers/net/bonding/Makefile +++ b/tools/testing/selftests/drivers/net/bonding/Makefile @@ -10,7 +10,7 @@ TEST_PROGS := \ mode-2-recovery-updelay.sh \ bond_options.sh \ bond-eth-type-change.sh \ - bond_macvlan.sh + bond_macvlan_ipvlan.sh TEST_FILES := \ lag_lib.sh \ diff --git a/tools/testing/selftests/drivers/net/bonding/bond_macvlan.sh b/tools/testing/selftests/drivers/net/bonding/bond_macvlan.sh deleted file mode 100755 index b609fb6231f4..000000000000 --- a/tools/testing/selftests/drivers/net/bonding/bond_macvlan.sh +++ /dev/null @@ -1,99 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 -# -# Test macvlan over balance-alb - -lib_dir=$(dirname "$0") -source ${lib_dir}/bond_topo_2d1c.sh - -m1_ns="m1-$(mktemp -u XXXXXX)" -m2_ns="m1-$(mktemp -u XXXXXX)" -m1_ip4="192.0.2.11" -m1_ip6="2001:db8::11" -m2_ip4="192.0.2.12" -m2_ip6="2001:db8::12" - -cleanup() -{ - ip -n ${m1_ns} link del macv0 - ip netns del ${m1_ns} - ip -n ${m2_ns} link del macv0 - ip netns del ${m2_ns} - - client_destroy - server_destroy - gateway_destroy -} - -check_connection() -{ - local ns=${1} - local target=${2} - local message=${3:-"macvlan_over_bond"} - RET=0 - - - ip netns exec ${ns} ping ${target} -c 4 -i 0.1 &>/dev/null - check_err $? 
"ping failed" - log_test "$mode: $message" -} - -macvlan_over_bond() -{ - local param="$1" - RET=0 - - # setup new bond mode - bond_reset "${param}" - - ip -n ${s_ns} link add link bond0 name macv0 type macvlan mode bridge - ip -n ${s_ns} link set macv0 netns ${m1_ns} - ip -n ${m1_ns} link set dev macv0 up - ip -n ${m1_ns} addr add ${m1_ip4}/24 dev macv0 - ip -n ${m1_ns} addr add ${m1_ip6}/24 dev macv0 - - ip -n ${s_ns} link add link bond0 name macv0 type macvlan mode bridge - ip -n ${s_ns} link set macv0 netns ${m2_ns} - ip -n ${m2_ns} link set dev macv0 up - ip -n ${m2_ns} addr add ${m2_ip4}/24 dev macv0 - ip -n ${m2_ns} addr add ${m2_ip6}/24 dev macv0 - - sleep 2 - - check_connection "${c_ns}" "${s_ip4}" "IPv4: client->server" - check_connection "${c_ns}" "${s_ip6}" "IPv6: client->server" - check_connection "${c_ns}" "${m1_ip4}" "IPv4: client->macvlan_1" - check_connection "${c_ns}" "${m1_ip6}" "IPv6: client->macvlan_1" - check_connection "${c_ns}" "${m2_ip4}" "IPv4: client->macvlan_2" - check_connection "${c_ns}" "${m2_ip6}" "IPv6: client->macvlan_2" - check_connection "${m1_ns}" "${m2_ip4}" "IPv4: macvlan_1->macvlan_2" - check_connection "${m1_ns}" "${m2_ip6}" "IPv6: macvlan_1->macvlan_2" - - - sleep 5 - - check_connection "${s_ns}" "${c_ip4}" "IPv4: server->client" - check_connection "${s_ns}" "${c_ip6}" "IPv6: server->client" - check_connection "${m1_ns}" "${c_ip4}" "IPv4: macvlan_1->client" - check_connection "${m1_ns}" "${c_ip6}" "IPv6: macvlan_1->client" - check_connection "${m2_ns}" "${c_ip4}" "IPv4: macvlan_2->client" - check_connection "${m2_ns}" "${c_ip6}" "IPv6: macvlan_2->client" - check_connection "${m2_ns}" "${m1_ip4}" "IPv4: macvlan_2->macvlan_2" - check_connection "${m2_ns}" "${m1_ip6}" "IPv6: macvlan_2->macvlan_2" - - ip -n ${c_ns} neigh flush dev eth0 -} - -trap cleanup EXIT - -setup_prepare -ip netns add ${m1_ns} -ip netns add ${m2_ns} - -modes="active-backup balance-tlb balance-alb" - -for mode in $modes; do - macvlan_over_bond "mode $mode" 
-done - -exit $EXIT_STATUS diff --git a/tools/testing/selftests/drivers/net/bonding/bond_macvlan_ipvlan.sh b/tools/testing/selftests/drivers/net/bonding/bond_macvlan_ipvlan.sh new file mode 100755 index 000000000000..c4711272fe45 --- /dev/null +++ b/tools/testing/selftests/drivers/net/bonding/bond_macvlan_ipvlan.sh @@ -0,0 +1,96 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Test macvlan/ipvlan over bond + +lib_dir=$(dirname "$0") +source ${lib_dir}/bond_topo_2d1c.sh + +xvlan1_ns="xvlan1-$(mktemp -u XXXXXX)" +xvlan2_ns="xvlan2-$(mktemp -u XXXXXX)" +xvlan1_ip4="192.0.2.11" +xvlan1_ip6="2001:db8::11" +xvlan2_ip4="192.0.2.12" +xvlan2_ip6="2001:db8::12" + +cleanup() +{ + client_destroy + server_destroy + gateway_destroy + + ip netns del ${xvlan1_ns} + ip netns del ${xvlan2_ns} +} + +check_connection() +{ + local ns=${1} + local target=${2} + local message=${3} + RET=0 + + ip netns exec ${ns} ping ${target} -c 4 -i 0.1 &>/dev/null + check_err $? "ping failed" + log_test "${bond_mode}/${xvlan_type}_${xvlan_mode}: ${message}" +} + +xvlan_over_bond() +{ + local param="$1" + local xvlan_type="$2" + local xvlan_mode="$3" + RET=0 + + # setup new bond mode + bond_reset "${param}" + + ip -n ${s_ns} link add link bond0 name ${xvlan_type}0 type ${xvlan_type} mode ${xvlan_mode} + ip -n ${s_ns} link set ${xvlan_type}0 netns ${xvlan1_ns} + ip -n ${xvlan1_ns} link set dev ${xvlan_type}0 up + ip -n ${xvlan1_ns} addr add ${xvlan1_ip4}/24 dev ${xvlan_type}0 + ip -n ${xvlan1_ns} addr add ${xvlan1_ip6}/24 dev ${xvlan_type}0 + + ip -n ${s_ns} link add link bond0 name ${xvlan_type}0 type ${xvlan_type} mode ${xvlan_mode} + ip -n ${s_ns} link set ${xvlan_type}0 netns ${xvlan2_ns} + ip -n ${xvlan2_ns} link set dev ${xvlan_type}0 up + ip -n ${xvlan2_ns} addr add ${xvlan2_ip4}/24 dev ${xvlan_type}0 + ip -n ${xvlan2_ns} addr add ${xvlan2_ip6}/24 dev ${xvlan_type}0 + + sleep 2 + + check_connection "${c_ns}" "${s_ip4}" "IPv4: client->server" + check_connection "${c_ns}" "${s_ip6}" "IPv6: 
client->server" + check_connection "${c_ns}" "${xvlan1_ip4}" "IPv4: client->${xvlan_type}_1" + check_connection "${c_ns}" "${xvlan1_ip6}" "IPv6: client->${xvlan_type}_1" + check_connection "${c_ns}" "${xvlan2_ip4}" "IPv4: client->${xvlan_type}_2" + check_connection "${c_ns}" "${xvlan2_ip6}" "IPv6: client->${xvlan_type}_2" + check_connection "${xvlan1_ns}" "${xvlan2_ip4}" "IPv4: ${xvlan_type}_1->${xvlan_type}_2" + check_connection "${xvlan1_ns}" "${xvlan2_ip6}" "IPv6: ${xvlan_type}_1->${xvlan_type}_2" + + check_connection "${s_ns}" "${c_ip4}" "IPv4: server->client" + check_connection "${s_ns}" "${c_ip6}" "IPv6: server->client" + check_connection "${xvlan1_ns}" "${c_ip4}" "IPv4: ${xvlan_type}_1->client" + check_connection "${xvlan1_ns}" "${c_ip6}" "IPv6: ${xvlan_type}_1->client" + check_connection "${xvlan2_ns}" "${c_ip4}" "IPv4: ${xvlan_type}_2->client" + check_connection "${xvlan2_ns}" "${c_ip6}" "IPv6: ${xvlan_type}_2->client" + check_connection "${xvlan2_ns}" "${xvlan1_ip4}" "IPv4: ${xvlan_type}_2->${xvlan_type}_1" + check_connection "${xvlan2_ns}" "${xvlan1_ip6}" "IPv6: ${xvlan_type}_2->${xvlan_type}_1" + + ip -n ${c_ns} neigh flush dev eth0 +} + +trap cleanup EXIT + +setup_prepare +ip netns add ${xvlan1_ns} +ip netns add ${xvlan2_ns} + +bond_modes="active-backup balance-tlb balance-alb" + +for bond_mode in ${bond_modes}; do + xvlan_over_bond "mode ${bond_mode}" macvlan bridge + xvlan_over_bond "mode ${bond_mode}" ipvlan l2 +done + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/drivers/net/bonding/config b/tools/testing/selftests/drivers/net/bonding/config index 899d7fb6ea8e..dad4e5fda4db 100644 --- a/tools/testing/selftests/drivers/net/bonding/config +++ b/tools/testing/selftests/drivers/net/bonding/config @@ -3,6 +3,7 @@ CONFIG_BRIDGE=y CONFIG_DUMMY=y CONFIG_IPV6=y CONFIG_MACVLAN=y +CONFIG_IPVLAN=y CONFIG_NET_ACT_GACT=y CONFIG_NET_CLS_FLOWER=y CONFIG_NET_SCH_INGRESS=y diff --git a/tools/testing/selftests/drivers/net/lib/py/env.py 
b/tools/testing/selftests/drivers/net/lib/py/env.py index fea343f209ea..987e452d3a45 100644 --- a/tools/testing/selftests/drivers/net/lib/py/env.py +++ b/tools/testing/selftests/drivers/net/lib/py/env.py @@ -48,6 +48,7 @@ class NetDrvEnv: else: self._ns = NetdevSimDev(**kwargs) self.dev = self._ns.nsims[0].dev + self.ifname = self.dev['ifname'] self.ifindex = self.dev['ifindex'] def __enter__(self): diff --git a/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh b/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh new file mode 100644 index 000000000000..3acaba41ac7b --- /dev/null +++ b/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh @@ -0,0 +1,225 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: GPL-2.0 + +# This file contains functions and helpers to support the netconsole +# selftests +# +# Author: Breno Leitao <leitao@debian.org> + +set -euo pipefail + +LIBDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") + +SRCIF="" # to be populated later +SRCIP=192.0.2.1 +DSTIF="" # to be populated later +DSTIP=192.0.2.2 + +PORT="6666" +MSG="netconsole selftest" +USERDATA_KEY="key" +USERDATA_VALUE="value" +TARGET=$(mktemp -u netcons_XXXXX) +DEFAULT_PRINTK_VALUES=$(cat /proc/sys/kernel/printk) +NETCONS_CONFIGFS="/sys/kernel/config/netconsole" +NETCONS_PATH="${NETCONS_CONFIGFS}"/"${TARGET}" +# NAMESPACE will be populated by setup_ns with a random value +NAMESPACE="" + +# IDs for netdevsim +NSIM_DEV_1_ID=$((256 + RANDOM % 256)) +NSIM_DEV_2_ID=$((512 + RANDOM % 256)) +NSIM_DEV_SYS_NEW="/sys/bus/netdevsim/new_device" + +# Used to create and delete namespaces +source "${LIBDIR}"/../../../../net/lib.sh +source "${LIBDIR}"/../../../../net/net_helper.sh + +# Create netdevsim interfaces +create_ifaces() { + + echo "$NSIM_DEV_2_ID" > "$NSIM_DEV_SYS_NEW" + echo "$NSIM_DEV_1_ID" > "$NSIM_DEV_SYS_NEW" + udevadm settle 2> /dev/null || true + + local NSIM1=/sys/bus/netdevsim/devices/netdevsim"$NSIM_DEV_1_ID" + local 
NSIM2=/sys/bus/netdevsim/devices/netdevsim"$NSIM_DEV_2_ID" + + # These are global variables + SRCIF=$(find "$NSIM1"/net -maxdepth 1 -type d ! \ + -path "$NSIM1"/net -exec basename {} \;) + DSTIF=$(find "$NSIM2"/net -maxdepth 1 -type d ! \ + -path "$NSIM2"/net -exec basename {} \;) +} + +link_ifaces() { + local NSIM_DEV_SYS_LINK="/sys/bus/netdevsim/link_device" + local SRCIF_IFIDX=$(cat /sys/class/net/"$SRCIF"/ifindex) + local DSTIF_IFIDX=$(cat /sys/class/net/"$DSTIF"/ifindex) + + exec {NAMESPACE_FD}</var/run/netns/"${NAMESPACE}" + exec {INITNS_FD}</proc/self/ns/net + + # Bind the dst interface to namespace + ip link set "${DSTIF}" netns "${NAMESPACE}" + + # Linking one device to the other one (on the other namespace} + if ! echo "${INITNS_FD}:$SRCIF_IFIDX $NAMESPACE_FD:$DSTIF_IFIDX" > $NSIM_DEV_SYS_LINK + then + echo "linking netdevsim1 with netdevsim2 should succeed" + cleanup + exit "${ksft_skip}" + fi +} + +function configure_ip() { + # Configure the IPs for both interfaces + ip netns exec "${NAMESPACE}" ip addr add "${DSTIP}"/24 dev "${DSTIF}" + ip netns exec "${NAMESPACE}" ip link set "${DSTIF}" up + + ip addr add "${SRCIP}"/24 dev "${SRCIF}" + ip link set "${SRCIF}" up +} + +function set_network() { + # setup_ns function is coming from lib.sh + setup_ns NAMESPACE + + # Create both interfaces, and assign the destination to a different + # namespace + create_ifaces + + # Link both interfaces back to back + link_ifaces + + configure_ip +} + +function create_dynamic_target() { + DSTMAC=$(ip netns exec "${NAMESPACE}" \ + ip link show "${DSTIF}" | awk '/ether/ {print $2}') + + # Create a dynamic target + mkdir "${NETCONS_PATH}" + + echo "${DSTIP}" > "${NETCONS_PATH}"/remote_ip + echo "${SRCIP}" > "${NETCONS_PATH}"/local_ip + echo "${DSTMAC}" > "${NETCONS_PATH}"/remote_mac + echo "${SRCIF}" > "${NETCONS_PATH}"/dev_name + + echo 1 > "${NETCONS_PATH}"/enabled +} + +function cleanup() { + local NSIM_DEV_SYS_DEL="/sys/bus/netdevsim/del_device" + + # delete netconsole 
dynamic reconfiguration + echo 0 > "${NETCONS_PATH}"/enabled + # Remove all the keys that got created during the selftest + find "${NETCONS_PATH}/userdata/" -mindepth 1 -type d -delete + # Remove the configfs entry + rmdir "${NETCONS_PATH}" + + # Delete netdevsim devices + echo "$NSIM_DEV_2_ID" > "$NSIM_DEV_SYS_DEL" + echo "$NSIM_DEV_1_ID" > "$NSIM_DEV_SYS_DEL" + + # this is coming from lib.sh + cleanup_all_ns + + # Restoring printk configurations + echo "${DEFAULT_PRINTK_VALUES}" > /proc/sys/kernel/printk +} + +function set_user_data() { + if [[ ! -d "${NETCONS_PATH}""/userdata" ]] + then + echo "Userdata path not available in ${NETCONS_PATH}/userdata" + exit "${ksft_skip}" + fi + + KEY_PATH="${NETCONS_PATH}/userdata/${USERDATA_KEY}" + mkdir -p "${KEY_PATH}" + VALUE_PATH="${KEY_PATH}""/value" + echo "${USERDATA_VALUE}" > "${VALUE_PATH}" +} + +function listen_port_and_save_to() { + local OUTPUT=${1} + # Just wait for 2 seconds + timeout 2 ip netns exec "${NAMESPACE}" \ + socat UDP-LISTEN:"${PORT}",fork "${OUTPUT}" +} + +function validate_result() { + local TMPFILENAME="$1" + + # TMPFILENAME will contain something like: + # 6.11.1-0_fbk0_rc13_509_g30d75cea12f7,13,1822,115075213798,-;netconsole selftest: netcons_gtJHM + # key=value + + # Check if the file exists + if [ ! -f "$TMPFILENAME" ]; then + echo "FAIL: File was not generated." >&2 + exit "${ksft_fail}" + fi + + if ! grep -q "${MSG}" "${TMPFILENAME}"; then + echo "FAIL: ${MSG} not found in ${TMPFILENAME}" >&2 + cat "${TMPFILENAME}" >&2 + exit "${ksft_fail}" + fi + + if ! 
grep -q "${USERDATA_KEY}=${USERDATA_VALUE}" "${TMPFILENAME}"; then + echo "FAIL: ${USERDATA_KEY}=${USERDATA_VALUE} not found in ${TMPFILENAME}" >&2 + cat "${TMPFILENAME}" >&2 + exit "${ksft_fail}" + fi + + # Delete the file once it is validated, otherwise keep it + # for debugging purposes + rm "${TMPFILENAME}" + exit "${ksft_pass}" +} + +function check_for_dependencies() { + if [ "$(id -u)" -ne 0 ]; then + echo "This test must be run as root" >&2 + exit "${ksft_skip}" + fi + + if ! which socat > /dev/null ; then + echo "SKIP: socat(1) is not available" >&2 + exit "${ksft_skip}" + fi + + if ! which ip > /dev/null ; then + echo "SKIP: ip(1) is not available" >&2 + exit "${ksft_skip}" + fi + + if ! which udevadm > /dev/null ; then + echo "SKIP: udevadm(1) is not available" >&2 + exit "${ksft_skip}" + fi + + if [ ! -f "${NSIM_DEV_SYS_NEW}" ]; then + echo "SKIP: file ${NSIM_DEV_SYS_NEW} does not exist. Check if CONFIG_NETDEVSIM is enabled" >&2 + exit "${ksft_skip}" + fi + + if [ ! -d "${NETCONS_CONFIGFS}" ]; then + echo "SKIP: directory ${NETCONS_CONFIGFS} does not exist. Check if NETCONSOLE_DYNAMIC is enabled" >&2 + exit "${ksft_skip}" + fi + + if ip link show "${DSTIF}" 2> /dev/null; then + echo "SKIP: interface ${DSTIF} exists in the system. Not overwriting it." >&2 + exit "${ksft_skip}" + fi + + if ip addr list | grep -E "inet.*(${SRCIP}|${DSTIP})" 2> /dev/null; then + echo "SKIP: IPs already in use. 
Skipping it" >&2 + exit "${ksft_skip}" + fi +} diff --git a/tools/testing/selftests/drivers/net/netcons_basic.sh b/tools/testing/selftests/drivers/net/netcons_basic.sh index b175f4d966e5..fe765da498e8 100755 --- a/tools/testing/selftests/drivers/net/netcons_basic.sh +++ b/tools/testing/selftests/drivers/net/netcons_basic.sh @@ -18,224 +18,8 @@ set -euo pipefail SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") -# Simple script to test dynamic targets in netconsole -SRCIF="" # to be populated later -SRCIP=192.0.2.1 -DSTIF="" # to be populated later -DSTIP=192.0.2.2 +source "${SCRIPTDIR}"/lib/sh/lib_netcons.sh -PORT="6666" -MSG="netconsole selftest" -USERDATA_KEY="key" -USERDATA_VALUE="value" -TARGET=$(mktemp -u netcons_XXXXX) -DEFAULT_PRINTK_VALUES=$(cat /proc/sys/kernel/printk) -NETCONS_CONFIGFS="/sys/kernel/config/netconsole" -NETCONS_PATH="${NETCONS_CONFIGFS}"/"${TARGET}" -KEY_PATH="${NETCONS_PATH}/userdata/${USERDATA_KEY}" -# NAMESPACE will be populated by setup_ns with a random value -NAMESPACE="" - -# IDs for netdevsim -NSIM_DEV_1_ID=$((256 + RANDOM % 256)) -NSIM_DEV_2_ID=$((512 + RANDOM % 256)) -NSIM_DEV_SYS_NEW="/sys/bus/netdevsim/new_device" - -# Used to create and delete namespaces -source "${SCRIPTDIR}"/../../net/lib.sh -source "${SCRIPTDIR}"/../../net/net_helper.sh - -# Create netdevsim interfaces -create_ifaces() { - - echo "$NSIM_DEV_2_ID" > "$NSIM_DEV_SYS_NEW" - echo "$NSIM_DEV_1_ID" > "$NSIM_DEV_SYS_NEW" - udevadm settle 2> /dev/null || true - - local NSIM1=/sys/bus/netdevsim/devices/netdevsim"$NSIM_DEV_1_ID" - local NSIM2=/sys/bus/netdevsim/devices/netdevsim"$NSIM_DEV_2_ID" - - # These are global variables - SRCIF=$(find "$NSIM1"/net -maxdepth 1 -type d ! \ - -path "$NSIM1"/net -exec basename {} \;) - DSTIF=$(find "$NSIM2"/net -maxdepth 1 -type d ! 
\ - -path "$NSIM2"/net -exec basename {} \;) -} - -link_ifaces() { - local NSIM_DEV_SYS_LINK="/sys/bus/netdevsim/link_device" - local SRCIF_IFIDX=$(cat /sys/class/net/"$SRCIF"/ifindex) - local DSTIF_IFIDX=$(cat /sys/class/net/"$DSTIF"/ifindex) - - exec {NAMESPACE_FD}</var/run/netns/"${NAMESPACE}" - exec {INITNS_FD}</proc/self/ns/net - - # Bind the dst interface to namespace - ip link set "${DSTIF}" netns "${NAMESPACE}" - - # Linking one device to the other one (on the other namespace} - if ! echo "${INITNS_FD}:$SRCIF_IFIDX $NAMESPACE_FD:$DSTIF_IFIDX" > $NSIM_DEV_SYS_LINK - then - echo "linking netdevsim1 with netdevsim2 should succeed" - cleanup - exit "${ksft_skip}" - fi -} - -function configure_ip() { - # Configure the IPs for both interfaces - ip netns exec "${NAMESPACE}" ip addr add "${DSTIP}"/24 dev "${DSTIF}" - ip netns exec "${NAMESPACE}" ip link set "${DSTIF}" up - - ip addr add "${SRCIP}"/24 dev "${SRCIF}" - ip link set "${SRCIF}" up -} - -function set_network() { - # setup_ns function is coming from lib.sh - setup_ns NAMESPACE - - # Create both interfaces, and assign the destination to a different - # namespace - create_ifaces - - # Link both interfaces back to back - link_ifaces - - configure_ip -} - -function create_dynamic_target() { - DSTMAC=$(ip netns exec "${NAMESPACE}" \ - ip link show "${DSTIF}" | awk '/ether/ {print $2}') - - # Create a dynamic target - mkdir "${NETCONS_PATH}" - - echo "${DSTIP}" > "${NETCONS_PATH}"/remote_ip - echo "${SRCIP}" > "${NETCONS_PATH}"/local_ip - echo "${DSTMAC}" > "${NETCONS_PATH}"/remote_mac - echo "${SRCIF}" > "${NETCONS_PATH}"/dev_name - - echo 1 > "${NETCONS_PATH}"/enabled -} - -function cleanup() { - local NSIM_DEV_SYS_DEL="/sys/bus/netdevsim/del_device" - - # delete netconsole dynamic reconfiguration - echo 0 > "${NETCONS_PATH}"/enabled - # Remove key - rmdir "${KEY_PATH}" - # Remove the configfs entry - rmdir "${NETCONS_PATH}" - - # Delete netdevsim devices - echo "$NSIM_DEV_2_ID" > "$NSIM_DEV_SYS_DEL" - echo 
"$NSIM_DEV_1_ID" > "$NSIM_DEV_SYS_DEL" - - # this is coming from lib.sh - cleanup_all_ns - - # Restoring printk configurations - echo "${DEFAULT_PRINTK_VALUES}" > /proc/sys/kernel/printk -} - -function set_user_data() { - if [[ ! -d "${NETCONS_PATH}""/userdata" ]] - then - echo "Userdata path not available in ${NETCONS_PATH}/userdata" - exit "${ksft_skip}" - fi - - mkdir -p "${KEY_PATH}" - VALUE_PATH="${KEY_PATH}""/value" - echo "${USERDATA_VALUE}" > "${VALUE_PATH}" -} - -function listen_port_and_save_to() { - local OUTPUT=${1} - # Just wait for 2 seconds - timeout 2 ip netns exec "${NAMESPACE}" \ - socat UDP-LISTEN:"${PORT}",fork "${OUTPUT}" -} - -function validate_result() { - local TMPFILENAME="$1" - - # TMPFILENAME will contain something like: - # 6.11.1-0_fbk0_rc13_509_g30d75cea12f7,13,1822,115075213798,-;netconsole selftest: netcons_gtJHM - # key=value - - # Check if the file exists - if [ ! -f "$TMPFILENAME" ]; then - echo "FAIL: File was not generated." >&2 - exit "${ksft_fail}" - fi - - if ! grep -q "${MSG}" "${TMPFILENAME}"; then - echo "FAIL: ${MSG} not found in ${TMPFILENAME}" >&2 - cat "${TMPFILENAME}" >&2 - exit "${ksft_fail}" - fi - - if ! grep -q "${USERDATA_KEY}=${USERDATA_VALUE}" "${TMPFILENAME}"; then - echo "FAIL: ${USERDATA_KEY}=${USERDATA_VALUE} not found in ${TMPFILENAME}" >&2 - cat "${TMPFILENAME}" >&2 - exit "${ksft_fail}" - fi - - # Delete the file once it is validated, otherwise keep it - # for debugging purposes - rm "${TMPFILENAME}" - exit "${ksft_pass}" -} - -function check_for_dependencies() { - if [ "$(id -u)" -ne 0 ]; then - echo "This test must be run as root" >&2 - exit "${ksft_skip}" - fi - - if ! which socat > /dev/null ; then - echo "SKIP: socat(1) is not available" >&2 - exit "${ksft_skip}" - fi - - if ! which ip > /dev/null ; then - echo "SKIP: ip(1) is not available" >&2 - exit "${ksft_skip}" - fi - - if ! which udevadm > /dev/null ; then - echo "SKIP: udevadm(1) is not available" >&2 - exit "${ksft_skip}" - fi - - if [ ! 
-f "${NSIM_DEV_SYS_NEW}" ]; then - echo "SKIP: file ${NSIM_DEV_SYS_NEW} does not exist. Check if CONFIG_NETDEVSIM is enabled" >&2 - exit "${ksft_skip}" - fi - - if [ ! -d "${NETCONS_CONFIGFS}" ]; then - echo "SKIP: directory ${NETCONS_CONFIGFS} does not exist. Check if NETCONSOLE_DYNAMIC is enabled" >&2 - exit "${ksft_skip}" - fi - - if ip link show "${DSTIF}" 2> /dev/null; then - echo "SKIP: interface ${DSTIF} exists in the system. Not overwriting it." >&2 - exit "${ksft_skip}" - fi - - if ip addr list | grep -E "inet.*(${SRCIP}|${DSTIP})" 2> /dev/null; then - echo "SKIP: IPs already in use. Skipping it" >&2 - exit "${ksft_skip}" - fi -} - -# ========== # -# Start here # -# ========== # modprobe netdevsim 2> /dev/null || true modprobe netconsole 2> /dev/null || true diff --git a/tools/testing/selftests/drivers/net/netcons_overflow.sh b/tools/testing/selftests/drivers/net/netcons_overflow.sh new file mode 100755 index 000000000000..29bad56448a2 --- /dev/null +++ b/tools/testing/selftests/drivers/net/netcons_overflow.sh @@ -0,0 +1,67 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: GPL-2.0 + +# This test verifies that users can successfully create up to +# MAX_USERDATA_ITEMS userdata entries without encountering any failures. +# +# Additionally, it tests for expected failure when attempting to exceed this +# maximum limit. +# +# Author: Breno Leitao <leitao@debian.org> + +set -euo pipefail + +SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") + +source "${SCRIPTDIR}"/lib/sh/lib_netcons.sh +# This is coming from netconsole code. 
Check for it in drivers/net/netconsole.c +MAX_USERDATA_ITEMS=16 + +# Function to create userdata entries +function create_userdata_max_entries() { + # All these keys should be created without any error + for i in $(seq $MAX_USERDATA_ITEMS) + do + # USERDATA_KEY is used by set_user_data + USERDATA_KEY="key"${i} + set_user_data + done +} + +# Function to verify the entry limit +function verify_entry_limit() { + # Allowing the test to fail without exiting, since the next command + # will fail + set +e + mkdir "${NETCONS_PATH}/userdata/key_that_will_fail" 2> /dev/null + ret="$?" + set -e + if [ "$ret" -eq 0 ]; + then + echo "Adding more than ${MAX_USERDATA_ITEMS} entries in userdata should fail, but it didn't" >&2 + ls "${NETCONS_PATH}/userdata/" >&2 + exit "${ksft_fail}" + fi +} + +# ========== # +# Start here # +# ========== # + +modprobe netdevsim 2> /dev/null || true +modprobe netconsole 2> /dev/null || true + +# Check for basic system dependency and exit if not found +check_for_dependencies + +# Remove the namespace, interfaces and netconsole target on exit +trap cleanup EXIT +# Create one namespace and two interfaces +set_network +# Create a dynamic target for netconsole +create_dynamic_target +# populate the maximum number of supported keys in userdata +create_userdata_max_entries +# Verify an additional entry is not allowed +verify_entry_limit +exit "${ksft_pass}" diff --git a/tools/testing/selftests/drivers/net/stats.py b/tools/testing/selftests/drivers/net/stats.py index 031ac9def6c0..efcc1e10575b 100755 --- a/tools/testing/selftests/drivers/net/stats.py +++ b/tools/testing/selftests/drivers/net/stats.py @@ -2,12 +2,15 @@ # SPDX-License-Identifier: GPL-2.0 import errno +import subprocess +import time from lib.py import ksft_run, ksft_exit, ksft_pr -from lib.py import ksft_ge, ksft_eq, ksft_in, ksft_true, ksft_raises, KsftSkipEx, KsftXfailEx +from lib.py import ksft_ge, ksft_eq, ksft_is, ksft_in, ksft_lt, ksft_true, ksft_raises +from lib.py import KsftSkipEx, 
KsftXfailEx from lib.py import ksft_disruptive from lib.py import EthtoolFamily, NetdevFamily, RtnlFamily, NlError from lib.py import NetDrvEnv -from lib.py import ip, defer +from lib.py import cmd, ip, defer ethnl = EthtoolFamily() netfam = NetdevFamily() @@ -174,10 +177,95 @@ def check_down(cfg) -> None: netfam.qstats_get({"ifindex": cfg.ifindex, "scope": "queue"}, dump=True) +def __run_inf_loop(body): + body = body.strip() + if body[-1] != ';': + body += ';' + + return subprocess.Popen(f"while true; do {body} done", shell=True, + stdout=subprocess.PIPE, stderr=subprocess.PIPE) + + +def __stats_increase_sanely(old, new) -> None: + for k in old.keys(): + ksft_ge(new[k], old[k]) + ksft_lt(new[k] - old[k], 1 << 31, comment="likely wrapping error") + + +def procfs_hammer(cfg) -> None: + """ + Reading stats via procfs only holds the RCU lock, which is not an exclusive + lock, make sure drivers can handle parallel reads of stats. + """ + one = __run_inf_loop("cat /proc/net/dev") + defer(one.kill) + two = __run_inf_loop("cat /proc/net/dev") + defer(two.kill) + + time.sleep(1) + # Make sure the processes are running + ksft_is(one.poll(), None) + ksft_is(two.poll(), None) + + rtstat1 = rtnl.getlink({"ifi-index": cfg.ifindex})['stats64'] + time.sleep(2) + rtstat2 = rtnl.getlink({"ifi-index": cfg.ifindex})['stats64'] + __stats_increase_sanely(rtstat1, rtstat2) + # defers will kill the loops + + +@ksft_disruptive +def procfs_downup_hammer(cfg) -> None: + """ + Reading stats via procfs only holds the RCU lock, drivers often try + to sleep when reading the stats, or don't protect against races. 
+ """ + # Max out the queues, we'll flip between max and 1 + channels = ethnl.channels_get({'header': {'dev-index': cfg.ifindex}}) + if channels['combined-count'] == 0: + rx_type = 'rx' + else: + rx_type = 'combined' + cur_queue_cnt = channels[f'{rx_type}-count'] + max_queue_cnt = channels[f'{rx_type}-max'] + + cmd(f"ethtool -L {cfg.ifname} {rx_type} {max_queue_cnt}") + defer(cmd, f"ethtool -L {cfg.ifname} {rx_type} {cur_queue_cnt}") + + # Real test stats + stats = __run_inf_loop("cat /proc/net/dev") + defer(stats.kill) + + ipset = f"ip link set dev {cfg.ifname}" + defer(ip, f"link set dev {cfg.ifname} up") + # The "echo -n 1" lets us count iterations below + updown = f"{ipset} down; sleep 0.05; {ipset} up; sleep 0.05; " + \ + f"ethtool -L {cfg.ifname} {rx_type} 1; " + \ + f"ethtool -L {cfg.ifname} {rx_type} {max_queue_cnt}; " + \ + "echo -n 1" + updown = __run_inf_loop(updown) + kill_updown = defer(updown.kill) + + time.sleep(1) + # Make sure the processes are running + ksft_is(stats.poll(), None) + ksft_is(updown.poll(), None) + + rtstat1 = rtnl.getlink({"ifi-index": cfg.ifindex})['stats64'] + # We're looking for crashes, give it extra time + time.sleep(9) + rtstat2 = rtnl.getlink({"ifi-index": cfg.ifindex})['stats64'] + __stats_increase_sanely(rtstat1, rtstat2) + + kill_updown.exec() + stdout, _ = updown.communicate(timeout=5) + ksft_pr("completed up/down cycles:", len(stdout.decode('utf-8'))) + + def main() -> None: with NetDrvEnv(__file__, queue_count=100) as cfg: ksft_run([check_pause, check_fec, pkt_byte_sum, qstat_by_ifindex, - check_down], + check_down, procfs_hammer, procfs_downup_hammer], args=(cfg, )) ksft_exit() diff --git a/tools/testing/selftests/kselftest/ktap_helpers.sh b/tools/testing/selftests/kselftest/ktap_helpers.sh index 79a125eb24c2..05a461890671 100644 --- a/tools/testing/selftests/kselftest/ktap_helpers.sh +++ b/tools/testing/selftests/kselftest/ktap_helpers.sh @@ -7,6 +7,7 @@ KTAP_TESTNO=1 KTAP_CNT_PASS=0 KTAP_CNT_FAIL=0 +KTAP_CNT_XFAIL=0 
KTAP_CNT_SKIP=0 KSFT_PASS=0 @@ -69,6 +70,16 @@ ktap_test_skip() { KTAP_CNT_SKIP=$((KTAP_CNT_SKIP+1)) } +ktap_test_xfail() { + description="$1" + + result="ok" + directive="XFAIL" + __ktap_test "$result" "$description" "$directive" + + KTAP_CNT_XFAIL=$((KTAP_CNT_XFAIL+1)) +} + ktap_test_fail() { description="$1" @@ -99,7 +110,7 @@ ktap_exit_fail_msg() { ktap_finished() { ktap_print_totals - if [ $((KTAP_CNT_PASS + KTAP_CNT_SKIP)) -eq "$KSFT_NUM_TESTS" ]; then + if [ $((KTAP_CNT_PASS + KTAP_CNT_SKIP + KTAP_CNT_XFAIL)) -eq "$KSFT_NUM_TESTS" ]; then exit "$KSFT_PASS" else exit "$KSFT_FAIL" @@ -107,5 +118,5 @@ ktap_finished() { } ktap_print_totals() { - echo "# Totals: pass:$KTAP_CNT_PASS fail:$KTAP_CNT_FAIL xfail:0 xpass:0 skip:$KTAP_CNT_SKIP error:0" + echo "# Totals: pass:$KTAP_CNT_PASS fail:$KTAP_CNT_FAIL xfail:$KTAP_CNT_XFAIL xpass:0 skip:$KTAP_CNT_SKIP error:0" } diff --git a/tools/testing/selftests/memfd/memfd_test.c b/tools/testing/selftests/memfd/memfd_test.c index 0a0b55516028..c0c53451a16d 100644 --- a/tools/testing/selftests/memfd/memfd_test.c +++ b/tools/testing/selftests/memfd/memfd_test.c @@ -282,6 +282,24 @@ static void *mfd_assert_mmap_shared(int fd) return p; } +static void *mfd_assert_mmap_read_shared(int fd) +{ + void *p; + + p = mmap(NULL, + mfd_def_size, + PROT_READ, + MAP_SHARED, + fd, + 0); + if (p == MAP_FAILED) { + printf("mmap() failed: %m\n"); + abort(); + } + + return p; +} + static void *mfd_assert_mmap_private(int fd) { void *p; @@ -980,6 +998,30 @@ static void test_seal_future_write(void) close(fd); } +static void test_seal_write_map_read_shared(void) +{ + int fd; + void *p; + + printf("%s SEAL-WRITE-MAP-READ\n", memfd_str); + + fd = mfd_assert_new("kern_memfd_seal_write_map_read", + mfd_def_size, + MFD_CLOEXEC | MFD_ALLOW_SEALING); + + mfd_assert_add_seals(fd, F_SEAL_WRITE); + mfd_assert_has_seals(fd, F_SEAL_WRITE); + + p = mfd_assert_mmap_read_shared(fd); + + mfd_assert_read(fd); + mfd_assert_read_shared(fd); + mfd_fail_write(fd); + + 
munmap(p, mfd_def_size); + close(fd); +} + /* * Test SEAL_SHRINK * Test whether SEAL_SHRINK actually prevents shrinking @@ -1593,6 +1635,7 @@ int main(int argc, char **argv) test_seal_write(); test_seal_future_write(); + test_seal_write_map_read_shared(); test_seal_shrink(); test_seal_grow(); test_seal_resize(); diff --git a/tools/testing/selftests/net/lib/py/ksft.py b/tools/testing/selftests/net/lib/py/ksft.py index 477ae76de93d..3efe005436cd 100644 --- a/tools/testing/selftests/net/lib/py/ksft.py +++ b/tools/testing/selftests/net/lib/py/ksft.py @@ -71,6 +71,11 @@ def ksft_in(a, b, comment=""): _fail("Check failed", a, "not in", b, comment) +def ksft_is(a, b, comment=""): + if a is not b: + _fail("Check failed", a, "is not", b, comment) + + def ksft_ge(a, b, comment=""): if a < b: _fail("Check failed", a, "<", b, comment) diff --git a/tools/testing/selftests/net/lib/py/ynl.py b/tools/testing/selftests/net/lib/py/ynl.py index 076a7e8dc3eb..ad1e36baee2a 100644 --- a/tools/testing/selftests/net/lib/py/ynl.py +++ b/tools/testing/selftests/net/lib/py/ynl.py @@ -13,14 +13,14 @@ try: SPEC_PATH = KSFT_DIR / "net/lib/specs" sys.path.append(tools_full_path.as_posix()) - from net.lib.ynl.lib import YnlFamily, NlError + from net.lib.ynl.pyynl.lib import YnlFamily, NlError else: # Running in tree tools_full_path = KSRC / "tools" SPEC_PATH = KSRC / "Documentation/netlink/specs" sys.path.append(tools_full_path.as_posix()) - from net.ynl.lib import YnlFamily, NlError + from net.ynl.pyynl.lib import YnlFamily, NlError except ModuleNotFoundError as e: ksft_pr("Failed importing `ynl` library from kernel sources") ksft_pr(str(e)) diff --git a/tools/testing/selftests/net/nl_netdev.py b/tools/testing/selftests/net/nl_netdev.py index 93d9d914529b..93e8cb671c3d 100755 --- a/tools/testing/selftests/net/nl_netdev.py +++ b/tools/testing/selftests/net/nl_netdev.py @@ -18,6 +18,23 @@ def lo_check(nf) -> None: ksft_eq(len(lo_info['xdp-rx-metadata-features']), 0) +def napi_list_check(nf) -> 
None: + with NetdevSimDev(queue_count=100) as nsimdev: + nsim = nsimdev.nsims[0] + + ip(f"link set dev {nsim.ifname} up") + + napis = nf.napi_get({'ifindex': nsim.ifindex}, dump=True) + ksft_eq(len(napis), 100) + + for q in [50, 0, 99]: + for i in range(4): + nsim.dfs_write("queue_reset", f"{q} {i}") + napis = nf.napi_get({'ifindex': nsim.ifindex}, dump=True) + ksft_eq(len(napis), 100, + comment=f"queue count after reset queue {q} mode {i}") + + def page_pool_check(nf) -> None: with NetdevSimDev() as nsimdev: nsim = nsimdev.nsims[0] @@ -89,7 +106,7 @@ def page_pool_check(nf) -> None: def main() -> None: nf = NetdevFamily() - ksft_run([empty_check, lo_check, page_pool_check], + ksft_run([empty_check, lo_check, page_pool_check, napi_list_check], args=(nf, )) ksft_exit() diff --git a/tools/testing/selftests/net/packetdrill/ksft_runner.sh b/tools/testing/selftests/net/packetdrill/ksft_runner.sh index 4071c133f29e..ff989c325eef 100755 --- a/tools/testing/selftests/net/packetdrill/ksft_runner.sh +++ b/tools/testing/selftests/net/packetdrill/ksft_runner.sh @@ -23,7 +23,7 @@ if [ $# -ne 1 ]; then ktap_exit_fail_msg "usage: $0 <script>" exit "$KSFT_FAIL" fi -script="$1" +script="$(basename $1)" if [ -z "$(which packetdrill)" ]; then ktap_skip_all "packetdrill not found in PATH" @@ -31,16 +31,29 @@ if [ -z "$(which packetdrill)" ]; then fi declare -a optargs +failfunc=ktap_test_fail + if [[ -n "${KSFT_MACHINE_SLOW}" ]]; then optargs+=('--tolerance_usecs=14000') + + # xfail tests that are known flaky with dbg config, not fixable. + # still run them for coverage (and expect 100% pass without dbg). 
+ declare -ar xfail_list=( + "tcp_fast_recovery_prr-ss.*.pkt" + "tcp_timestamping.*.pkt" + "tcp_user_timeout_user-timeout-probe.pkt" + "tcp_zerocopy_epoll_.*.pkt" + ) + readonly xfail_regex="^($(printf '%s|' "${xfail_list[@]}"))$" + [[ "$script" =~ ${xfail_regex} ]] && failfunc=ktap_test_xfail fi ktap_print_header ktap_set_plan 2 -unshare -n packetdrill ${ipv4_args[@]} ${optargs[@]} $(basename $script) > /dev/null \ - && ktap_test_pass "ipv4" || ktap_test_fail "ipv4" -unshare -n packetdrill ${ipv6_args[@]} ${optargs[@]} $(basename $script) > /dev/null \ - && ktap_test_pass "ipv6" || ktap_test_fail "ipv6" +unshare -n packetdrill ${ipv4_args[@]} ${optargs[@]} $script > /dev/null \ + && ktap_test_pass "ipv4" || $failfunc "ipv4" +unshare -n packetdrill ${ipv6_args[@]} ${optargs[@]} $script > /dev/null \ + && ktap_test_pass "ipv6" || $failfunc "ipv6" ktap_finished diff --git a/tools/testing/selftests/net/ynl.mk b/tools/testing/selftests/net/ynl.mk index d43afe243779..12e7cae251be 100644 --- a/tools/testing/selftests/net/ynl.mk +++ b/tools/testing/selftests/net/ynl.mk @@ -31,7 +31,8 @@ $(OUTPUT)/libynl.a: $(YNL_SPECS) $(OUTPUT)/.libynl-$(YNL_GENS_HASH).sig $(Q)cp $(top_srcdir)/tools/net/ynl/libynl.a $(OUTPUT)/libynl.a EXTRA_CLEAN += \ - $(top_srcdir)/tools/net/ynl/lib/__pycache__ \ + $(top_srcdir)/tools/net/ynl/pyynl/__pycache__ \ + $(top_srcdir)/tools/net/ynl/pyynl/lib/__pycache__ \ $(top_srcdir)/tools/net/ynl/lib/*.[ado] \ $(OUTPUT)/.libynl-*.sig \ $(OUTPUT)/libynl.a diff --git a/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.bpf.c b/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.bpf.c index 37d9bf6fb745..6f4c3f5a1c5d 100644 --- a/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.bpf.c +++ b/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.bpf.c @@ -20,7 +20,7 @@ s32 BPF_STRUCT_OPS(ddsp_bogus_dsq_fail_select_cpu, struct task_struct *p, * If we dispatch to a bogus DSQ that will fall back to the * builtin global DSQ, we fail gracefully. 
*/ - scx_bpf_dispatch_vtime(p, 0xcafef00d, SCX_SLICE_DFL, + scx_bpf_dsq_insert_vtime(p, 0xcafef00d, SCX_SLICE_DFL, p->scx.dsq_vtime, 0); return cpu; } diff --git a/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.bpf.c b/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.bpf.c index dffc97d9cdf1..e4a55027778f 100644 --- a/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.bpf.c +++ b/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.bpf.c @@ -17,8 +17,8 @@ s32 BPF_STRUCT_OPS(ddsp_vtimelocal_fail_select_cpu, struct task_struct *p, if (cpu >= 0) { /* Shouldn't be allowed to vtime dispatch to a builtin DSQ. */ - scx_bpf_dispatch_vtime(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, - p->scx.dsq_vtime, 0); + scx_bpf_dsq_insert_vtime(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, + p->scx.dsq_vtime, 0); return cpu; } diff --git a/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c b/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c index 6a7db1502c29..fbda6bf54671 100644 --- a/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c +++ b/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c @@ -43,9 +43,12 @@ void BPF_STRUCT_OPS(dsp_local_on_dispatch, s32 cpu, struct task_struct *prev) if (!p) return; - target = bpf_get_prandom_u32() % nr_cpus; + if (p->nr_cpus_allowed == nr_cpus) + target = bpf_get_prandom_u32() % nr_cpus; + else + target = scx_bpf_task_cpu(p); - scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | target, SCX_SLICE_DFL, 0); + scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | target, SCX_SLICE_DFL, 0); bpf_task_release(p); } diff --git a/tools/testing/selftests/sched_ext/dsp_local_on.c b/tools/testing/selftests/sched_ext/dsp_local_on.c index 472851b56854..0ff27e57fe43 100644 --- a/tools/testing/selftests/sched_ext/dsp_local_on.c +++ b/tools/testing/selftests/sched_ext/dsp_local_on.c @@ -34,9 +34,10 @@ static enum scx_test_status run(void *ctx) /* Just sleeping is fine, plenty of scheduling events happening */ sleep(1); - SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_ERROR)); 
bpf_link__destroy(link); + SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_UNREG)); + return SCX_TEST_PASS; } @@ -50,7 +51,7 @@ static void cleanup(void *ctx) struct scx_test dsp_local_on = { .name = "dsp_local_on", .description = "Verify we can directly dispatch tasks to a local DSQs " - "from osp.dispatch()", + "from ops.dispatch()", .setup = setup, .run = run, .cleanup = cleanup, diff --git a/tools/testing/selftests/sched_ext/enq_select_cpu_fails.bpf.c b/tools/testing/selftests/sched_ext/enq_select_cpu_fails.bpf.c index 1efb50d61040..a7cf868d5e31 100644 --- a/tools/testing/selftests/sched_ext/enq_select_cpu_fails.bpf.c +++ b/tools/testing/selftests/sched_ext/enq_select_cpu_fails.bpf.c @@ -31,7 +31,7 @@ void BPF_STRUCT_OPS(enq_select_cpu_fails_enqueue, struct task_struct *p, /* Can only call from ops.select_cpu() */ scx_bpf_select_cpu_dfl(p, 0, 0, &found); - scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); + scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); } SEC(".struct_ops.link") diff --git a/tools/testing/selftests/sched_ext/exit.bpf.c b/tools/testing/selftests/sched_ext/exit.bpf.c index d75d4faf07f6..4bc36182d3ff 100644 --- a/tools/testing/selftests/sched_ext/exit.bpf.c +++ b/tools/testing/selftests/sched_ext/exit.bpf.c @@ -33,7 +33,7 @@ void BPF_STRUCT_OPS(exit_enqueue, struct task_struct *p, u64 enq_flags) if (exit_point == EXIT_ENQUEUE) EXIT_CLEANLY(); - scx_bpf_dispatch(p, DSQ_ID, SCX_SLICE_DFL, enq_flags); + scx_bpf_dsq_insert(p, DSQ_ID, SCX_SLICE_DFL, enq_flags); } void BPF_STRUCT_OPS(exit_dispatch, s32 cpu, struct task_struct *p) @@ -41,7 +41,7 @@ void BPF_STRUCT_OPS(exit_dispatch, s32 cpu, struct task_struct *p) if (exit_point == EXIT_DISPATCH) EXIT_CLEANLY(); - scx_bpf_consume(DSQ_ID); + scx_bpf_dsq_move_to_local(DSQ_ID); } void BPF_STRUCT_OPS(exit_enable, struct task_struct *p) diff --git a/tools/testing/selftests/sched_ext/maximal.bpf.c b/tools/testing/selftests/sched_ext/maximal.bpf.c index 4d4cd8d966db..430f5e13bf55 
100644 --- a/tools/testing/selftests/sched_ext/maximal.bpf.c +++ b/tools/testing/selftests/sched_ext/maximal.bpf.c @@ -12,6 +12,8 @@ char _license[] SEC("license") = "GPL"; +#define DSQ_ID 0 + s32 BPF_STRUCT_OPS(maximal_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) { @@ -20,7 +22,7 @@ s32 BPF_STRUCT_OPS(maximal_select_cpu, struct task_struct *p, s32 prev_cpu, void BPF_STRUCT_OPS(maximal_enqueue, struct task_struct *p, u64 enq_flags) { - scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); + scx_bpf_dsq_insert(p, DSQ_ID, SCX_SLICE_DFL, enq_flags); } void BPF_STRUCT_OPS(maximal_dequeue, struct task_struct *p, u64 deq_flags) @@ -28,7 +30,7 @@ void BPF_STRUCT_OPS(maximal_dequeue, struct task_struct *p, u64 deq_flags) void BPF_STRUCT_OPS(maximal_dispatch, s32 cpu, struct task_struct *prev) { - scx_bpf_consume(SCX_DSQ_GLOBAL); + scx_bpf_dsq_move_to_local(DSQ_ID); } void BPF_STRUCT_OPS(maximal_runnable, struct task_struct *p, u64 enq_flags) @@ -123,7 +125,7 @@ void BPF_STRUCT_OPS(maximal_cgroup_set_weight, struct cgroup *cgrp, u32 weight) s32 BPF_STRUCT_OPS_SLEEPABLE(maximal_init) { - return 0; + return scx_bpf_create_dsq(DSQ_ID, -1); } void BPF_STRUCT_OPS(maximal_exit, struct scx_exit_info *info) diff --git a/tools/testing/selftests/sched_ext/select_cpu_dfl.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dfl.bpf.c index f171ac470970..13d0f5be788d 100644 --- a/tools/testing/selftests/sched_ext/select_cpu_dfl.bpf.c +++ b/tools/testing/selftests/sched_ext/select_cpu_dfl.bpf.c @@ -30,7 +30,7 @@ void BPF_STRUCT_OPS(select_cpu_dfl_enqueue, struct task_struct *p, } scx_bpf_put_idle_cpumask(idle_mask); - scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); + scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); } SEC(".struct_ops.link") diff --git a/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.bpf.c index 9efdbb7da928..815f1d5d61ac 100644 --- 
a/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.bpf.c +++ b/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.bpf.c @@ -67,7 +67,7 @@ void BPF_STRUCT_OPS(select_cpu_dfl_nodispatch_enqueue, struct task_struct *p, saw_local = true; } - scx_bpf_dispatch(p, dsq_id, SCX_SLICE_DFL, enq_flags); + scx_bpf_dsq_insert(p, dsq_id, SCX_SLICE_DFL, enq_flags); } s32 BPF_STRUCT_OPS(select_cpu_dfl_nodispatch_init_task, diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch.bpf.c index 59bfc4f36167..4bb99699e920 100644 --- a/tools/testing/selftests/sched_ext/select_cpu_dispatch.bpf.c +++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch.bpf.c @@ -29,7 +29,7 @@ s32 BPF_STRUCT_OPS(select_cpu_dispatch_select_cpu, struct task_struct *p, cpu = prev_cpu; dispatch: - scx_bpf_dispatch(p, dsq_id, SCX_SLICE_DFL, 0); + scx_bpf_dsq_insert(p, dsq_id, SCX_SLICE_DFL, 0); return cpu; } diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.bpf.c index 3bbd5fcdfb18..2a75de11b2cf 100644 --- a/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.bpf.c +++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.bpf.c @@ -18,7 +18,7 @@ s32 BPF_STRUCT_OPS(select_cpu_dispatch_bad_dsq_select_cpu, struct task_struct *p s32 prev_cpu, u64 wake_flags) { /* Dispatching to a random DSQ should fail. 
*/ - scx_bpf_dispatch(p, 0xcafef00d, SCX_SLICE_DFL, 0); + scx_bpf_dsq_insert(p, 0xcafef00d, SCX_SLICE_DFL, 0); return prev_cpu; } diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.bpf.c index 0fda57fe0ecf..99d075695c97 100644 --- a/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.bpf.c +++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.bpf.c @@ -18,8 +18,8 @@ s32 BPF_STRUCT_OPS(select_cpu_dispatch_dbl_dsp_select_cpu, struct task_struct *p s32 prev_cpu, u64 wake_flags) { /* Dispatching twice in a row is disallowed. */ - scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); - scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); + scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); + scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); return prev_cpu; } diff --git a/tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c index e6c67bcf5e6e..bfcb96cd4954 100644 --- a/tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c +++ b/tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c @@ -2,8 +2,8 @@ /* * A scheduler that validates that enqueue flags are properly stored and * applied at dispatch time when a task is directly dispatched from - * ops.select_cpu(). We validate this by using scx_bpf_dispatch_vtime(), and - * making the test a very basic vtime scheduler. + * ops.select_cpu(). We validate this by using scx_bpf_dsq_insert_vtime(), + * and making the test a very basic vtime scheduler. * * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
* Copyright (c) 2024 David Vernet <dvernet@meta.com> @@ -47,13 +47,13 @@ s32 BPF_STRUCT_OPS(select_cpu_vtime_select_cpu, struct task_struct *p, cpu = prev_cpu; scx_bpf_test_and_clear_cpu_idle(cpu); ddsp: - scx_bpf_dispatch_vtime(p, VTIME_DSQ, SCX_SLICE_DFL, task_vtime(p), 0); + scx_bpf_dsq_insert_vtime(p, VTIME_DSQ, SCX_SLICE_DFL, task_vtime(p), 0); return cpu; } void BPF_STRUCT_OPS(select_cpu_vtime_dispatch, s32 cpu, struct task_struct *p) { - if (scx_bpf_consume(VTIME_DSQ)) + if (scx_bpf_dsq_move_to_local(VTIME_DSQ)) consumed = true; } diff --git a/tools/testing/selftests/tc-testing/tc-tests/filters/flow.json b/tools/testing/selftests/tc-testing/tc-tests/filters/flow.json index 996448afe31b..91d120548bf5 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/filters/flow.json +++ b/tools/testing/selftests/tc-testing/tc-tests/filters/flow.json @@ -78,10 +78,10 @@ "setup": [ "$TC qdisc add dev $DEV1 ingress" ], - "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 prio 1 protocol ip flow map key dst rshift 0xff", + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 prio 1 protocol ip flow map key dst rshift 0x1f", "expExitCode": "0", "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 protocol ip prio 1 flow", - "matchPattern": "filter parent ffff: protocol ip pref 1 flow chain [0-9]+ handle 0x1 map keys dst rshift 255 baseclass", + "matchPattern": "filter parent ffff: protocol ip pref 1 flow chain [0-9]+ handle 0x1 map keys dst rshift 31 baseclass", "matchCount": "1", "teardown": [ "$TC qdisc del dev $DEV1 ingress" |