436 files changed, 12117 insertions, 3967 deletions
diff --git a/.mailmap b/.mailmap
index 7efe43237ca8..9b620deda3fc 100644
--- a/.mailmap
+++ b/.mailmap
@@ -435,7 +435,7 @@ Martin Kepplinger <martink@posteo.de> <martin.kepplinger@ginzinger.com>
 Martin Kepplinger <martink@posteo.de> <martin.kepplinger@puri.sm>
 Martin Kepplinger <martink@posteo.de> <martin.kepplinger@theobroma-systems.com>
 Martyna Szapar-Mudlaw <martyna.szapar-mudlaw@linux.intel.com> <martyna.szapar-mudlaw@intel.com>
-Mathieu Othacehe <m.othacehe@gmail.com> <othacehe@gnu.org>
+Mathieu Othacehe <othacehe@gnu.org> <m.othacehe@gmail.com>
 Mat Martineau <martineau@kernel.org> <mathew.j.martineau@linux.intel.com>
 Mat Martineau <martineau@kernel.org> <mathewm@codeaurora.org>
 Matthew Wilcox <willy@infradead.org> <matthew.r.wilcox@intel.com>
@@ -529,6 +529,8 @@ Oleksij Rempel <linux@rempel-privat.de> <external.Oleksij.Rempel@de.bosch.com>
 Oleksij Rempel <linux@rempel-privat.de> <fixed-term.Oleksij.Rempel@de.bosch.com>
 Oleksij Rempel <o.rempel@pengutronix.de>
 Oleksij Rempel <o.rempel@pengutronix.de> <ore@pengutronix.de>
+Oliver Hartkopp <socketcan@hartkopp.net> <oliver.hartkopp@volkswagen.de>
+Oliver Hartkopp <socketcan@hartkopp.net> <oliver@hartkopp.net>
 Oliver Upton <oliver.upton@linux.dev> <oupton@google.com>
 Ondřej Jirman <megi@xff.cz> <megous@megous.com>
 Oza Pawandeep <quic_poza@quicinc.com> <poza@codeaurora.org>
diff --git a/CREDITS b/CREDITS
index b1777b53c63a..cda68f04d5f1 100644
--- a/CREDITS
+++ b/CREDITS
@@ -20,6 +20,10 @@ N: Thomas Abraham
 E: thomas.ab@samsung.com
 D: Samsung pin controller driver
 
+N: Jose Abreu
+E: jose.abreu@synopsys.com
+D: Synopsys DesignWare XPCS MDIO/PCS driver.
+
 N: Dragos Acostachioaie
 E: dragos@iname.com
 W: http://www.arbornet.org/~dragos
@@ -1428,6 +1432,10 @@ S: 8124 Constitution Apt. 7
 S: Sterling Heights, Michigan 48313
 S: USA
 
+N: Andy Gospodarek
+E: andy@greyhouse.net
+D: Maintenance and contributions to the network interface bonding driver.
+
 N: Wolfgang Grandegger
 E: wg@grandegger.com
 D: Controller Area Network (device drivers)
@@ -1812,6 +1820,10 @@ D: Author/maintainer of most DRM drivers (especially ATI, MGA)
 D: Core DRM templates, general DRM and 3D-related hacking
 S: No fixed address
 
+N: Woojung Huh
+E: woojung.huh@microchip.com
+D: Microchip LAN78XX USB Ethernet driver
+
 N: Kenn Humborg
 E: kenn@wombat.ie
 D: Mods to loop device to support sparse backing files
diff --git a/Documentation/Makefile b/Documentation/Makefile
index fa71602ec961..52c6c5a3efa9 100644
--- a/Documentation/Makefile
+++ b/Documentation/Makefile
@@ -104,7 +104,7 @@ quiet_cmd_sphinx = SPHINX  $@ --> file://$(abspath $(BUILDDIR)/$3/$4)
 YNL_INDEX:=$(srctree)/Documentation/networking/netlink_spec/index.rst
 YNL_RST_DIR:=$(srctree)/Documentation/networking/netlink_spec
 YNL_YAML_DIR:=$(srctree)/Documentation/netlink/specs
-YNL_TOOL:=$(srctree)/tools/net/ynl/ynl-gen-rst.py
+YNL_TOOL:=$(srctree)/tools/net/ynl/pyynl/ynl_gen_rst.py
 
 YNL_RST_FILES_TMP := $(patsubst %.yaml,%.rst,$(wildcard $(YNL_YAML_DIR)/*.yaml))
 YNL_RST_FILES := $(patsubst $(YNL_YAML_DIR)%,$(YNL_RST_DIR)%, $(YNL_RST_FILES_TMP))
diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
index 5034915f4e8e..8872203df088 100644
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -436,7 +436,7 @@ AnonHugePmdMapped).
 The number of file transparent huge pages mapped to userspace is available
 by reading ShmemPmdMapped and ShmemHugePages fields in ``/proc/meminfo``.
 To identify what applications are mapping file transparent huge pages, it
-is necessary to read ``/proc/PID/smaps`` and count the FileHugeMapped fields
+is necessary to read ``/proc/PID/smaps`` and count the FilePmdMapped fields
 for each mapping.
 
 Note that reading the smaps file is expensive and reading it
diff --git a/Documentation/devicetree/bindings/net/amlogic,meson-dwmac.yaml b/Documentation/devicetree/bindings/net/amlogic,meson-dwmac.yaml
index d1e2bca3c503..798a4c19f18c 100644
--- a/Documentation/devicetree/bindings/net/amlogic,meson-dwmac.yaml
+++ b/Documentation/devicetree/bindings/net/amlogic,meson-dwmac.yaml
@@ -166,11 +166,11 @@ unevaluatedProperties: false
 examples:
   - |
     ethmac: ethernet@c9410000 {
-         compatible = "amlogic,meson-gxbb-dwmac", "snps,dwmac";
-         reg = <0xc9410000 0x10000>, <0xc8834540 0x8>;
-         interrupts = <8>;
-         interrupt-names = "macirq";
-         clocks = <&clk_eth>, <&clk_fclk_div2>, <&clk_mpll2>, <&clk_fclk_div2>;
-         clock-names = "stmmaceth", "clkin0", "clkin1", "timing-adjustment";
-         phy-mode = "rgmii";
+        compatible = "amlogic,meson-gxbb-dwmac", "snps,dwmac";
+        reg = <0xc9410000 0x10000>, <0xc8834540 0x8>;
+        interrupts = <8>;
+        interrupt-names = "macirq";
+        clocks = <&clk_eth>, <&clk_fclk_div2>, <&clk_mpll2>, <&clk_fclk_div2>;
+        clock-names = "stmmaceth", "clkin0", "clkin1", "timing-adjustment";
+        phy-mode = "rgmii";
     };
diff --git a/Documentation/devicetree/bindings/net/asix,ax88178.yaml b/Documentation/devicetree/bindings/net/asix,ax88178.yaml
index 768504ccbf74..03341b7438d5 100644
--- a/Documentation/devicetree/bindings/net/asix,ax88178.yaml
+++ b/Documentation/devicetree/bindings/net/asix,ax88178.yaml
@@ -63,8 +63,8 @@ examples:
             #size-cells = <0>;
 
             ethernet@1 {
-               compatible = "usbb95,772b";
-               reg = <1>;
+                compatible = "usbb95,772b";
+                reg = <1>;
             };
         };
     };
diff --git a/Documentation/devicetree/bindings/net/brcm,bcmgenet.yaml b/Documentation/devicetree/bindings/net/brcm,bcmgenet.yaml
index 7c90a4390531..0e3fb4e42e3f 100644
--- a/Documentation/devicetree/bindings/net/brcm,bcmgenet.yaml
+++ b/Documentation/devicetree/bindings/net/brcm,bcmgenet.yaml
@@ -85,16 +85,16 @@ examples:
         #size-cells = <1>;
 
         mdio0: mdio@e14 {
-           compatible = "brcm,genet-mdio-v4";
-           #address-cells = <1>;
-           #size-cells = <0>;
-           reg = <0xe14 0x8>;
+            compatible = "brcm,genet-mdio-v4";
+            #address-cells = <1>;
+            #size-cells = <0>;
+            reg = <0xe14 0x8>;
 
-           phy1: ethernet-phy@1 {
+            phy1: ethernet-phy@1 {
                 max-speed = <1000>;
                 reg = <1>;
                 compatible = "ethernet-phy-ieee802.3-c22";
-           };
+            };
         };
     };
 
@@ -110,10 +110,10 @@ examples:
         interrupts = <0x0 0x16 0x0>, <0x0 0x17 0x0>;
 
         mdio1: mdio@e14 {
-           compatible = "brcm,genet-mdio-v4";
-           #address-cells = <1>;
-           #size-cells = <0>;
-           reg = <0xe14 0x8>;
+            compatible = "brcm,genet-mdio-v4";
+            #address-cells = <1>;
+            #size-cells = <0>;
+            reg = <0xe14 0x8>;
         };
     };
 
@@ -129,15 +129,15 @@ examples:
         interrupts = <0x0 0x18 0x0>, <0x0 0x19 0x0>;
 
         mdio2: mdio@e14 {
-           compatible = "brcm,genet-mdio-v4";
-           #address-cells = <1>;
-           #size-cells = <0>;
-           reg = <0xe14 0x8>;
+            compatible = "brcm,genet-mdio-v4";
+            #address-cells = <1>;
+            #size-cells = <0>;
+            reg = <0xe14 0x8>;
 
-           phy0: ethernet-phy@0 {
+            phy0: ethernet-phy@0 {
                 max-speed = <1000>;
                 reg = <0>;
                 compatible = "ethernet-phy-ieee802.3-c22";
-           };
+            };
         };
     };
diff --git a/Documentation/devicetree/bindings/net/brcm,mdio-mux-iproc.yaml b/Documentation/devicetree/bindings/net/brcm,mdio-mux-iproc.yaml
index af96b4fd89d5..3f27746d9a56 100644
--- a/Documentation/devicetree/bindings/net/brcm,mdio-mux-iproc.yaml
+++ b/Documentation/devicetree/bindings/net/brcm,mdio-mux-iproc.yaml
@@ -38,43 +38,43 @@ unevaluatedProperties: false
 
 examples:
   - |
-    mdio_mux_iproc: mdio-mux@66020000 {
+    mdio-mux@66020000 {
         compatible = "brcm,mdio-mux-iproc";
         reg = <0x66020000 0x250>;
         #address-cells = <1>;
         #size-cells = <0>;
 
         mdio@0 {
-           reg = <0x0>;
-           #address-cells = <1>;
-           #size-cells = <0>;
-
-           pci_phy0: pci-phy@0 {
-              compatible = "brcm,ns2-pcie-phy";
-              reg = <0x0>;
-              #phy-cells = <0>;
-           };
+            reg = <0x0>;
+            #address-cells = <1>;
+            #size-cells = <0>;
+
+            pci-phy@0 {
+                compatible = "brcm,ns2-pcie-phy";
+                reg = <0x0>;
+                #phy-cells = <0>;
+            };
         };
 
         mdio@7 {
-           reg = <0x7>;
-           #address-cells = <1>;
-           #size-cells = <0>;
-
-           pci_phy1: pci-phy@0 {
-              compatible = "brcm,ns2-pcie-phy";
-              reg = <0x0>;
-              #phy-cells = <0>;
-           };
+            reg = <0x7>;
+            #address-cells = <1>;
+            #size-cells = <0>;
+
+            pci-phy@0 {
+                compatible = "brcm,ns2-pcie-phy";
+                reg = <0x0>;
+                #phy-cells = <0>;
+            };
         };
 
         mdio@10 {
-           reg = <0x10>;
-           #address-cells = <1>;
-           #size-cells = <0>;
+            reg = <0x10>;
+            #address-cells = <1>;
+            #size-cells = <0>;
 
-           gphy0: eth-phy@10 {
-              reg = <0x10>;
-           };
+            eth-phy@10 {
+                reg = <0x10>;
+            };
         };
     };
diff --git a/Documentation/devicetree/bindings/net/can/atmel,at91sam9263-can.yaml b/Documentation/devicetree/bindings/net/can/atmel,at91sam9263-can.yaml
new file mode 100644
index 000000000000..c818c01a718b
--- /dev/null
+++ b/Documentation/devicetree/bindings/net/can/atmel,at91sam9263-can.yaml
@@ -0,0 +1,58 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/net/can/atmel,at91sam9263-can.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Microchip AT91 CAN Controller
+
+maintainers:
+  - Nicolas Ferre <nicolas.ferre@microchip.com>
+
+allOf:
+  - $ref: can-controller.yaml#
+
+properties:
+  compatible:
+    oneOf:
+      - enum:
+          - atmel,at91sam9263-can
+          - atmel,at91sam9x5-can
+      - items:
+          - enum:
+              - microchip,sam9x60-can
+          - const: atmel,at91sam9x5-can
+
+  reg:
+    maxItems: 1
+
+  interrupts:
+    maxItems: 1
+
+  clocks:
+    maxItems: 1
+
+  clock-names:
+    items:
+      - const: can_clk
+
+required:
+  - compatible
+  - reg
+  - interrupts
+  - clocks
+  - clock-names
+
+unevaluatedProperties: false
+
+examples:
+  - |
+    #include <dt-bindings/interrupt-controller/irq.h>
+    #include <dt-bindings/clock/at91.h>
+    can@f000c000 {
+          compatible = "atmel,at91sam9263-can";
+          reg = <0xf000c000 0x300>;
+          interrupts = <30 IRQ_TYPE_LEVEL_HIGH 3>;
+          clocks = <&pmc PMC_TYPE_PERIPHERAL 12>;
+          clock-names = "can_clk";
+    };
diff --git a/Documentation/devicetree/bindings/net/can/atmel-can.txt b/Documentation/devicetree/bindings/net/can/atmel-can.txt
deleted file mode 100644
index 218a3b3eb27e..000000000000
--- a/Documentation/devicetree/bindings/net/can/atmel-can.txt
+++ /dev/null
@@ -1,15 +0,0 @@
-* AT91 CAN *
-
-Required properties:
-  - compatible: Should be "atmel,at91sam9263-can", "atmel,at91sam9x5-can" or
-    "microchip,sam9x60-can"
-  - reg: Should contain CAN controller registers location and length
-  - interrupts: Should contain IRQ line for the CAN controller
-
-Example:
-
-	can0: can@f000c000 {
-		compatible = "atmel,at91sam9x5-can";
-		reg = <0xf000c000 0x300>;
-		interrupts = <40 4 5>
-	};
diff --git a/Documentation/devicetree/bindings/net/can/bosch,c_can.yaml b/Documentation/devicetree/bindings/net/can/bosch,c_can.yaml
index 4d7d67ee175a..ff1b59a0294e 100644
--- a/Documentation/devicetree/bindings/net/can/bosch,c_can.yaml
+++ b/Documentation/devicetree/bindings/net/can/bosch,c_can.yaml
@@ -99,11 +99,11 @@ examples:
     #include <dt-bindings/reset/altr,rst-mgr.h>
 
     can@ffc00000 {
-       compatible = "bosch,d_can";
-       reg = <0xffc00000 0x1000>;
-       interrupts = <0 131 4>, <0 132 4>, <0 133 4>, <0 134 4>;
-       clocks = <&can0_clk>;
-       resets = <&rst CAN0_RESET>;
+        compatible = "bosch,d_can";
+        reg = <0xffc00000 0x1000>;
+        interrupts = <0 131 4>, <0 132 4>, <0 133 4>, <0 134 4>;
+        clocks = <&can0_clk>;
+        resets = <&rst CAN0_RESET>;
     };
   - |
     can@0 {
diff --git a/Documentation/devicetree/bindings/net/can/microchip,mcp2510.yaml b/Documentation/devicetree/bindings/net/can/microchip,mcp2510.yaml
index db446dde6842..e0ec53bc10c6 100644
--- a/Documentation/devicetree/bindings/net/can/microchip,mcp2510.yaml
+++ b/Documentation/devicetree/bindings/net/can/microchip,mcp2510.yaml
@@ -56,15 +56,15 @@ examples:
         #size-cells = <0>;
 
         can@1 {
-             compatible = "microchip,mcp2515";
-             reg = <1>;
-             clocks = <&clk24m>;
-             interrupt-parent = <&gpio4>;
-             interrupts = <13 IRQ_TYPE_LEVEL_LOW>;
-             vdd-supply = <&reg5v0>;
-             xceiver-supply = <&reg5v0>;
-             gpio-controller;
-             #gpio-cells = <2>;
+            compatible = "microchip,mcp2515";
+            reg = <1>;
+            clocks = <&clk24m>;
+            interrupt-parent = <&gpio4>;
+            interrupts = <13 IRQ_TYPE_LEVEL_LOW>;
+            vdd-supply = <&reg5v0>;
+            xceiver-supply = <&reg5v0>;
+            gpio-controller;
+            #gpio-cells = <2>;
         };
     };
 
diff --git a/Documentation/devicetree/bindings/net/can/microchip,mpfs-can.yaml b/Documentation/devicetree/bindings/net/can/microchip,mpfs-can.yaml
index 01e4d4a54df6..1219c5cb601f 100644
--- a/Documentation/devicetree/bindings/net/can/microchip,mpfs-can.yaml
+++ b/Documentation/devicetree/bindings/net/can/microchip,mpfs-can.yaml
@@ -15,7 +15,11 @@ allOf:
 
 properties:
   compatible:
-    const: microchip,mpfs-can
+    oneOf:
+      - items:
+          - const: microchip,pic64gx-can
+          - const: microchip,mpfs-can
+      - const: microchip,mpfs-can
 
   reg:
     maxItems: 1
diff --git a/Documentation/devicetree/bindings/net/can/st,stm32-bxcan.yaml b/Documentation/devicetree/bindings/net/can/st,stm32-bxcan.yaml
index de1d4298893b..c7510b00954a 100644
--- a/Documentation/devicetree/bindings/net/can/st,stm32-bxcan.yaml
+++ b/Documentation/devicetree/bindings/net/can/st,stm32-bxcan.yaml
@@ -63,7 +63,7 @@ properties:
     maxItems: 1
 
   st,gcan:
-    $ref: /schemas/types.yaml#/definitions/phandle-array
+    $ref: /schemas/types.yaml#/definitions/phandle
     description:
       The phandle to the gcan node which allows to access the 512-bytes
       SRAM memory shared by the two bxCAN cells (CAN1 primary and CAN2
diff --git a/Documentation/devicetree/bindings/net/can/tcan4x5x.txt b/Documentation/devicetree/bindings/net/can/tcan4x5x.txt
deleted file mode 100644
index 20c0572c9853..000000000000
--- a/Documentation/devicetree/bindings/net/can/tcan4x5x.txt
+++ /dev/null
@@ -1,48 +0,0 @@
-Texas Instruments TCAN4x5x CAN Controller
-================================================
-
-This file provides device node information for the TCAN4x5x interface contains.
-
-Required properties:
-	- compatible:
-		"ti,tcan4552", "ti,tcan4x5x"
-		"ti,tcan4553", "ti,tcan4x5x" or
-		"ti,tcan4x5x"
-	- reg: 0
-	- #address-cells: 1
-	- #size-cells: 0
-	- spi-max-frequency: Maximum frequency of the SPI bus the chip can
-			     operate at should be less than or equal to 18 MHz.
-	- interrupt-parent: the phandle to the interrupt controller which provides
-                    the interrupt.
-	- interrupts: interrupt specification for data-ready.
-
-See Documentation/devicetree/bindings/net/can/bosch,m_can.yaml for additional
-required property details.
-
-Optional properties:
-	- reset-gpios: Hardwired output GPIO. If not defined then software
-		       reset.
-	- device-state-gpios: Input GPIO that indicates if the device is in
-			      a sleep state or if the device is active. Not
-			      available with tcan4552/4553.
-	- device-wake-gpios: Wake up GPIO to wake up the TCAN device. Not
-			     available with tcan4552/4553.
-	- wakeup-source: Leave the chip running when suspended, and configure
-			 the RX interrupt to wake up the device.
-
-Example:
-tcan4x5x: tcan4x5x@0 {
-		compatible = "ti,tcan4x5x";
-		reg = <0>;
-		#address-cells = <1>;
-		#size-cells = <1>;
-		spi-max-frequency = <10000000>;
-		bosch,mram-cfg = <0x0 0 0 16 0 0 1 1>;
-		interrupt-parent = <&gpio1>;
-		interrupts = <14 IRQ_TYPE_LEVEL_LOW>;
-		device-state-gpios = <&gpio3 21 GPIO_ACTIVE_HIGH>;
-		device-wake-gpios = <&gpio1 15 GPIO_ACTIVE_HIGH>;
-		reset-gpios = <&gpio1 27 GPIO_ACTIVE_HIGH>;
-		wakeup-source;
-};
diff --git a/Documentation/devicetree/bindings/net/can/ti,tcan4x5x.yaml b/Documentation/devicetree/bindings/net/can/ti,tcan4x5x.yaml
new file mode 100644
index 000000000000..384e15da2713
--- /dev/null
+++ b/Documentation/devicetree/bindings/net/can/ti,tcan4x5x.yaml
@@ -0,0 +1,199 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/net/can/ti,tcan4x5x.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Texas Instruments TCAN4x5x CAN Controller
+
+maintainers:
+  - Marc Kleine-Budde <mkl@pengutronix.de>
+
+properties:
+  compatible:
+    oneOf:
+      - items:
+          - enum:
+              - ti,tcan4552
+              - ti,tcan4553
+          - const: ti,tcan4x5x
+      - const: ti,tcan4x5x
+
+  reg:
+    maxItems: 1
+
+  interrupts:
+    maxItems: 1
+    description: The GPIO parent interrupt.
+
+  clocks:
+    maxItems: 1
+
+  clock-names:
+    items:
+      - const: cclk
+
+  reset-gpios:
+    description: Hardwired output GPIO. If not defined then software reset.
+    maxItems: 1
+
+  device-state-gpios:
+    description:
+      Input GPIO that indicates if the device is in a sleep state or if the
+      device is active. Not available with tcan4552/4553.
+    maxItems: 1
+
+  device-wake-gpios:
+    description:
+      Wake up GPIO to wake up the TCAN device.
+      Not available with tcan4552/4553.
+    maxItems: 1
+
+  bosch,mram-cfg:
+    description: |
+      Message RAM configuration data.
+      Multiple M_CAN instances can share the same Message RAM
+      and each element(e.g Rx FIFO or Tx Buffer and etc) number
+      in Message RAM is also configurable, so this property is
+      telling driver how the shared or private Message RAM are
+      used by this M_CAN controller.
+
+      The format should be as follows:
+      <offset sidf_elems xidf_elems rxf0_elems rxf1_elems rxb_elems txe_elems txb_elems>
+      The 'offset' is an address offset of the Message RAM where
+      the following elements start from. This is usually set to
+      0x0 if you're using a private Message RAM. The remain cells
+      are used to specify how many elements are used for each FIFO/Buffer.
+
+      M_CAN includes the following elements according to user manual:
+      11-bit Filter	0-128 elements / 0-128 words
+      29-bit Filter	0-64 elements / 0-128 words
+      Rx FIFO 0		0-64 elements / 0-1152 words
+      Rx FIFO 1		0-64 elements / 0-1152 words
+      Rx Buffers	0-64 elements / 0-1152 words
+      Tx Event FIFO	0-32 elements / 0-64 words
+      Tx Buffers	0-32 elements / 0-576 words
+
+      Please refer to 2.4.1 Message RAM Configuration in Bosch
+      M_CAN user manual for details.
+    $ref: /schemas/types.yaml#/definitions/int32-array
+    items:
+      - description: The 'offset' is an address offset of the Message RAM where
+          the following elements start from. This is usually set to 0x0 if
+          you're using a private Message RAM.
+        default: 0
+      - description: 11-bit Filter 0-128 elements / 0-128 words
+        minimum: 0
+        maximum: 128
+      - description: 29-bit Filter 0-64 elements / 0-128 words
+        minimum: 0
+        maximum: 64
+      - description: Rx FIFO 0 0-64 elements / 0-1152 words
+        minimum: 0
+        maximum: 64
+      - description: Rx FIFO 1 0-64 elements / 0-1152 words
+        minimum: 0
+        maximum: 64
+      - description: Rx Buffers 0-64 elements / 0-1152 words
+        minimum: 0
+        maximum: 64
+      - description: Tx Event FIFO 0-32 elements / 0-64 words
+        minimum: 0
+        maximum: 32
+      - description: Tx Buffers 0-32 elements / 0-576 words
+        minimum: 0
+        maximum: 32
+    minItems: 1
+
+  spi-max-frequency:
+    description:
+      Must be half or less of "clocks" frequency.
+    maximum: 18000000
+
+  ti,nwkrq-voltage-vio:
+    type: boolean
+    description:
+      nWKRQ Pin GPO buffer voltage configuration.
+      Set nWKRQ to use VIO voltage rail.
+      When not set nWKRQ will use internal voltage rail.
+
+  wakeup-source:
+    $ref: /schemas/types.yaml#/definitions/flag
+    description:
+      Enable CAN remote wakeup.
+
+allOf:
+  - $ref: can-controller.yaml#
+  - $ref: /schemas/spi/spi-peripheral-props.yaml#
+  - if:
+      properties:
+        compatible:
+          contains:
+            enum:
+              - ti,tcan4552
+              - ti,tcan4553
+    then:
+      properties:
+        device-state-gpios: false
+        device-wake-gpios: false
+
+required:
+  - compatible
+  - reg
+  - interrupts
+  - clocks
+  - clock-names
+  - bosch,mram-cfg
+
+unevaluatedProperties: false
+
+examples:
+  - |
+    #include <dt-bindings/gpio/gpio.h>
+    #include <dt-bindings/interrupt-controller/irq.h>
+
+    spi {
+        #address-cells = <1>;
+        #size-cells = <0>;
+
+        can@0 {
+            compatible = "ti,tcan4x5x";
+            reg = <0>;
+            clocks = <&can0_osc>;
+            clock-names = "cclk";
+            pinctrl-names = "default";
+            pinctrl-0 = <&can0_pins>;
+            spi-max-frequency = <10000000>;
+            bosch,mram-cfg = <0x0 0 0 16 0 0 1 1>;
+            interrupt-parent = <&gpio1>;
+            interrupts = <14 IRQ_TYPE_LEVEL_LOW>;
+            device-state-gpios = <&gpio3 21 GPIO_ACTIVE_HIGH>;
+            device-wake-gpios = <&gpio1 15 GPIO_ACTIVE_HIGH>;
+            reset-gpios = <&gpio1 27 GPIO_ACTIVE_HIGH>;
+            ti,nwkrq-voltage-vio;
+            wakeup-source;
+        };
+    };
+  - |
+    #include <dt-bindings/gpio/gpio.h>
+    #include <dt-bindings/interrupt-controller/irq.h>
+
+    spi {
+        #address-cells = <1>;
+        #size-cells = <0>;
+
+        can@0 {
+            compatible = "ti,tcan4552", "ti,tcan4x5x";
+            reg = <0>;
+            clocks = <&can0_osc>;
+            clock-names = "cclk";
+            pinctrl-names = "default";
+            pinctrl-0 = <&can0_pins>;
+            spi-max-frequency = <10000000>;
+            bosch,mram-cfg = <0x0 0 0 16 0 0 1 1>;
+            interrupt-parent = <&gpio1>;
+            interrupts = <14 IRQ_TYPE_LEVEL_LOW>;
+            reset-gpios = <&gpio1 27 GPIO_ACTIVE_HIGH>;
+            wakeup-source;
+        };
+    };
diff --git a/Documentation/devicetree/bindings/net/pse-pd/pse-controller.yaml b/Documentation/devicetree/bindings/net/pse-pd/pse-controller.yaml
index a12cda8aa764..cd09560e0aea 100644
--- a/Documentation/devicetree/bindings/net/pse-pd/pse-controller.yaml
+++ b/Documentation/devicetree/bindings/net/pse-pd/pse-controller.yaml
@@ -81,7 +81,7 @@ properties:
               List of phandles, each pointing to the power supply for the
               corresponding pairset named in 'pairset-names'. This property
               aligns with IEEE 802.3-2022, Section 33.2.3 and 145.2.4.
-              PSE Pinout Alternatives (as per IEEE 802.3-2022 Table 145\u20133)
+              PSE Pinout Alternatives (as per IEEE 802.3-2022 Table 145-3)
               |-----------|---------------|---------------|---------------|---------------|
               | Conductor | Alternative A | Alternative A | Alternative B | Alternative B |
               |           |    (MDI-X)    |     (MDI)     |      (X)      |      (S)      |
diff --git a/Documentation/devicetree/bindings/net/qcom,ipa.yaml b/Documentation/devicetree/bindings/net/qcom,ipa.yaml
index 53cae71d9957..1a46d80a66e8 100644
--- a/Documentation/devicetree/bindings/net/qcom,ipa.yaml
+++ b/Documentation/devicetree/bindings/net/qcom,ipa.yaml
@@ -239,7 +239,7 @@ examples:
 
                 qcom,gsi-loader = "self";
                 memory-region = <&ipa_fw_mem>;
-                firmware-name = "qcom/sc7180-trogdor/modem/modem.mdt";
+                firmware-name = "qcom/sc7180-trogdor/modem/modem.mbn";
 
                 iommus = <&apps_smmu 0x440 0x0>,
                          <&apps_smmu 0x442 0x0>;
diff --git a/Documentation/devicetree/bindings/net/stm32-dwmac.yaml b/Documentation/devicetree/bindings/net/stm32-dwmac.yaml
index bf23838fe6e8..85cea9966a27 100644
--- a/Documentation/devicetree/bindings/net/stm32-dwmac.yaml
+++ b/Documentation/devicetree/bindings/net/stm32-dwmac.yaml
@@ -154,56 +154,56 @@ examples:
     #include <dt-bindings/interrupt-controller/arm-gic.h>
     #include <dt-bindings/clock/stm32mp1-clks.h>
     //Example 1
-     ethernet0: ethernet@5800a000 {
-           compatible = "st,stm32mp1-dwmac", "snps,dwmac-4.20a";
-           reg = <0x5800a000 0x2000>;
-           reg-names = "stmmaceth";
-           interrupts = <GIC_SPI 61 IRQ_TYPE_LEVEL_HIGH>;
-           interrupt-names = "macirq";
-           clock-names = "stmmaceth",
-                     "mac-clk-tx",
-                     "mac-clk-rx",
-                     "ethstp",
-                     "eth-ck";
-           clocks = <&rcc ETHMAC>,
-                <&rcc ETHTX>,
-                <&rcc ETHRX>,
-                <&rcc ETHSTP>,
-                <&rcc ETHCK_K>;
-           st,syscon = <&syscfg 0x4>;
-           snps,pbl = <2>;
-           snps,axi-config = <&stmmac_axi_config_0>;
-           snps,tso;
-           phy-mode = "rgmii";
-       };
+    ethernet0: ethernet@5800a000 {
+        compatible = "st,stm32mp1-dwmac", "snps,dwmac-4.20a";
+        reg = <0x5800a000 0x2000>;
+        reg-names = "stmmaceth";
+        interrupts = <GIC_SPI 61 IRQ_TYPE_LEVEL_HIGH>;
+        interrupt-names = "macirq";
+        clock-names = "stmmaceth",
+                      "mac-clk-tx",
+                      "mac-clk-rx",
+                      "ethstp",
+                      "eth-ck";
+        clocks = <&rcc ETHMAC>,
+                 <&rcc ETHTX>,
+                 <&rcc ETHRX>,
+                 <&rcc ETHSTP>,
+                 <&rcc ETHCK_K>;
+        st,syscon = <&syscfg 0x4>;
+        snps,pbl = <2>;
+        snps,axi-config = <&stmmac_axi_config_0>;
+        snps,tso;
+        phy-mode = "rgmii";
+    };
 
   - |
     //Example 2 (MCU example)
-     ethernet1: ethernet@40028000 {
-           compatible = "st,stm32-dwmac", "snps,dwmac-3.50a";
-           reg = <0x40028000 0x8000>;
-           reg-names = "stmmaceth";
-           interrupts = <0 61 0>, <0 62 0>;
-           interrupt-names = "macirq", "eth_wake_irq";
-           clock-names = "stmmaceth", "mac-clk-tx", "mac-clk-rx";
-           clocks = <&rcc 0 25>, <&rcc 0 26>, <&rcc 0 27>;
-           st,syscon = <&syscfg 0x4>;
-           snps,pbl = <8>;
-           snps,mixed-burst;
-           phy-mode = "mii";
-       };
+    ethernet1: ethernet@40028000 {
+        compatible = "st,stm32-dwmac", "snps,dwmac-3.50a";
+        reg = <0x40028000 0x8000>;
+        reg-names = "stmmaceth";
+        interrupts = <0 61 0>, <0 62 0>;
+        interrupt-names = "macirq", "eth_wake_irq";
+        clock-names = "stmmaceth", "mac-clk-tx", "mac-clk-rx";
+        clocks = <&rcc 0 25>, <&rcc 0 26>, <&rcc 0 27>;
+        st,syscon = <&syscfg 0x4>;
+        snps,pbl = <8>;
+        snps,mixed-burst;
+        phy-mode = "mii";
+    };
 
   - |
     //Example 3
-     ethernet2: ethernet@40027000 {
-           compatible = "st,stm32-dwmac", "snps,dwmac-4.10a";
-           reg = <0x40028000 0x8000>;
-           reg-names = "stmmaceth";
-           interrupts = <61>;
-           interrupt-names = "macirq";
-           clock-names = "stmmaceth", "mac-clk-tx", "mac-clk-rx";
-           clocks = <&rcc 62>, <&rcc 61>, <&rcc 60>;
-           st,syscon = <&syscfg 0x4>;
-           snps,pbl = <8>;
-           phy-mode = "mii";
-       };
+    ethernet2: ethernet@40027000 {
+        compatible = "st,stm32-dwmac", "snps,dwmac-4.10a";
+        reg = <0x40028000 0x8000>;
+        reg-names = "stmmaceth";
+        interrupts = <61>;
+        interrupt-names = "macirq";
+        clock-names = "stmmaceth", "mac-clk-tx", "mac-clk-rx";
+        clocks = <&rcc 62>, <&rcc 61>, <&rcc 60>;
+        st,syscon = <&syscfg 0x4>;
+        snps,pbl = <8>;
+        phy-mode = "mii";
+    };
diff --git a/Documentation/devicetree/bindings/net/ti,davinci-mdio.yaml b/Documentation/devicetree/bindings/net/ti,davinci-mdio.yaml
index 53604fab0b73..08119b6880ee 100644
--- a/Documentation/devicetree/bindings/net/ti,davinci-mdio.yaml
+++ b/Documentation/devicetree/bindings/net/ti,davinci-mdio.yaml
@@ -72,9 +72,9 @@ unevaluatedProperties: false
 examples:
   - |
     davinci_mdio: mdio@4a101000 {
-         compatible = "ti,davinci_mdio";
-         #address-cells = <1>;
-         #size-cells = <0>;
-         reg = <0x4a101000 0x1000>;
-         bus_freq = <1000000>;
+        compatible = "ti,davinci_mdio";
+        #address-cells = <1>;
+        #size-cells = <0>;
+        reg = <0x4a101000 0x1000>;
+        bus_freq = <1000000>;
     };
diff --git a/Documentation/devicetree/bindings/net/ti,k3-am654-cpsw-nuss.yaml b/Documentation/devicetree/bindings/net/ti,k3-am654-cpsw-nuss.yaml
index 02b6d32003cc..b11894fbaec4 100644
--- a/Documentation/devicetree/bindings/net/ti,k3-am654-cpsw-nuss.yaml
+++ b/Documentation/devicetree/bindings/net/ti,k3-am654-cpsw-nuss.yaml
@@ -302,16 +302,16 @@ examples:
                     ti,fifo-depth = <DP83867_PHYCR_FIFO_DEPTH_4_B_NIB>;
                 };
             };
-        };
 
-        cpts@3d000 {
-             compatible = "ti,am65-cpts";
-             reg = <0x0 0x3d000 0x0 0x400>;
-             clocks = <&k3_clks 18 2>;
-             clock-names = "cpts";
-             interrupts-extended = <&gic500 GIC_SPI 858 IRQ_TYPE_LEVEL_HIGH>;
-             interrupt-names = "cpts";
-             ti,cpts-ext-ts-inputs = <4>;
-             ti,cpts-periodic-outputs = <2>;
+            cpts@3d000 {
+                compatible = "ti,am65-cpts";
+                reg = <0x0 0x3d000 0x0 0x400>;
+                clocks = <&k3_clks 18 2>;
+                clock-names = "cpts";
+                interrupts-extended = <&gic500 GIC_SPI 858 IRQ_TYPE_LEVEL_HIGH>;
+                interrupt-names = "cpts";
+                ti,cpts-ext-ts-inputs = <4>;
+                ti,cpts-periodic-outputs = <2>;
+            };
         };
     };
diff --git a/Documentation/devicetree/bindings/net/ti,k3-am654-cpts.yaml b/Documentation/devicetree/bindings/net/ti,k3-am654-cpts.yaml
index 3888692275ad..3572749147fb 100644
--- a/Documentation/devicetree/bindings/net/ti,k3-am654-cpts.yaml
+++ b/Documentation/devicetree/bindings/net/ti,k3-am654-cpts.yaml
@@ -131,23 +131,23 @@ examples:
     #include <dt-bindings/interrupt-controller/arm-gic.h>
 
     cpts@310d0000 {
-         compatible = "ti,am65-cpts";
-         reg = <0x310d0000 0x400>;
-         reg-names = "cpts";
-         clocks = <&main_cpts_mux>;
-         clock-names = "cpts";
-         interrupts-extended = <&k3_irq 163 0 IRQ_TYPE_LEVEL_HIGH>;
-         interrupt-names = "cpts";
-         ti,cpts-periodic-outputs = <6>;
-         ti,cpts-ext-ts-inputs = <8>;
-
-         main_cpts_mux: refclk-mux {
-               #clock-cells = <0>;
-               clocks = <&k3_clks 118 5>, <&k3_clks 118 11>,
-                        <&k3_clks 157 91>, <&k3_clks 157 77>,
-                        <&k3_clks 157 102>, <&k3_clks 157 80>,
-                        <&k3_clks 120 3>, <&k3_clks 121 3>;
-               assigned-clocks = <&main_cpts_mux>;
-               assigned-clock-parents = <&k3_clks 118 11>;
-         };
+        compatible = "ti,am65-cpts";
+        reg = <0x310d0000 0x400>;
+        reg-names = "cpts";
+        clocks = <&main_cpts_mux>;
+        clock-names = "cpts";
+        interrupts-extended = <&k3_irq 163 0 IRQ_TYPE_LEVEL_HIGH>;
+        interrupt-names = "cpts";
+        ti,cpts-periodic-outputs = <6>;
+        ti,cpts-ext-ts-inputs = <8>;
+
+        main_cpts_mux: refclk-mux {
+            #clock-cells = <0>;
+            clocks = <&k3_clks 118 5>, <&k3_clks 118 11>,
+                     <&k3_clks 157 91>, <&k3_clks 157 77>,
+                     <&k3_clks 157 102>, <&k3_clks 157 80>,
+                     <&k3_clks 120 3>, <&k3_clks 121 3>;
+            assigned-clocks = <&main_cpts_mux>;
+            assigned-clock-parents = <&k3_clks 118 11>;
+        };
     };
diff --git a/Documentation/devicetree/bindings/net/wireless/marvell,sd8787.yaml b/Documentation/devicetree/bindings/net/wireless/marvell,sd8787.yaml
index 1715b22e0dcf..930b700b73d0 100644
--- a/Documentation/devicetree/bindings/net/wireless/marvell,sd8787.yaml
+++ b/Documentation/devicetree/bindings/net/wireless/marvell,sd8787.yaml
@@ -79,15 +79,14 @@ examples:
     #include <dt-bindings/interrupt-controller/irq.h>
 
     mmc {
-         #address-cells = <1>;
-         #size-cells = <0>;
-
-         wifi@1 {
-             compatible = "marvell,sd8897";
-             reg = <1>;
-             interrupt-parent = <&pio>;
-             interrupts = <38 IRQ_TYPE_LEVEL_LOW>;
-             marvell,wakeup-pin = <3>;
+        #address-cells = <1>;
+        #size-cells = <0>;
+
+        wifi@1 {
+            compatible = "marvell,sd8897";
+            reg = <1>;
+            interrupt-parent = <&pio>;
+            interrupts = <38 IRQ_TYPE_LEVEL_LOW>;
+            marvell,wakeup-pin = <3>;
         };
     };
-
diff --git a/Documentation/netlink/specs/rt_link.yaml b/Documentation/netlink/specs/rt_link.yaml
index 96465376d6fe..0d492500c7e5 100644
--- a/Documentation/netlink/specs/rt_link.yaml
+++ b/Documentation/netlink/specs/rt_link.yaml
@@ -1826,6 +1826,48 @@ attribute-sets:
         name: erspan-hwid
         type: u16
   -
+    name: linkinfo-vti-attrs
+    name-prefix: ifla-vti-
+    attributes:
+      -
+        name: link
+        type: u32
+      -
+        name: ikey
+        type: u32
+      -
+        name: okey
+        type: u32
+      -
+        name: local
+        type: binary
+        display-hint: ipv4
+      -
+        name: remote
+        type: binary
+        display-hint: ipv4
+      -
+        name: fwmark
+        type: u32
+  -
+    name: linkinfo-vti6-attrs
+    subset-of: linkinfo-vti-attrs
+    attributes:
+      -
+        name: link
+      -
+        name: ikey
+      -
+        name: okey
+      -
+        name: local
+        display-hint: ipv6
+      -
+        name: remote
+        display-hint: ipv6
+      -
+        name: fwmark
+  -
     name: linkinfo-geneve-attrs
     name-prefix: ifla-geneve-
     attributes:
@@ -1942,6 +1984,42 @@ attribute-sets:
         name: fwmark
         type: u32
   -
+    name: linkinfo-ip6tnl-attrs
+    subset-of: linkinfo-iptun-attrs
+    attributes:
+      -
+        name: link
+      -
+        name: local
+        display-hint: ipv6
+      -
+        name: remote
+        display-hint: ipv6
+      -
+        name: ttl
+      -
+        name: encap-limit
+      -
+        name: flowinfo
+      -
+        name: flags
+        # ip6tnl unlike ipip and sit has 32b flags
+        type: u32
+      -
+        name: proto
+      -
+        name: encap-type
+      -
+        name: encap-flags
+      -
+        name: encap-sport
+      -
+        name: encap-dport
+      -
+        name: collect-metadata
+      -
+        name: fwmark
+  -
     name: linkinfo-tun-attrs
     name-prefix: ifla-tun-
     attributes:
@@ -2169,6 +2247,12 @@ attribute-sets:
         name: peer-scrub
         type: u32
         enum: netkit-scrub
+      -
+        name: headroom
+        type: u16
+      -
+        name: tailroom
+        type: u16
 
 sub-messages:
   -
@@ -2196,6 +2280,9 @@ sub-messages:
         value: ipip
         attribute-set: linkinfo-iptun-attrs
       -
+        value: ip6tnl
+        attribute-set: linkinfo-ip6tnl-attrs
+      -
         value: sit
         attribute-set: linkinfo-iptun-attrs
       -
@@ -2208,6 +2295,12 @@ sub-messages:
         value: vrf
         attribute-set: linkinfo-vrf-attrs
       -
+        value: vti
+        attribute-set: linkinfo-vti-attrs
+      -
+        value: vti6
+        attribute-set: linkinfo-vti6-attrs
+      -
         value: netkit
         attribute-set: linkinfo-netkit-attrs
   -
diff --git a/Documentation/networking/device_drivers/ethernet/intel/i40e.rst b/Documentation/networking/device_drivers/ethernet/intel/i40e.rst
index 4fbaa1a2d674..53d9d5829d69 100644
--- a/Documentation/networking/device_drivers/ethernet/intel/i40e.rst
+++ b/Documentation/networking/device_drivers/ethernet/intel/i40e.rst
@@ -299,6 +299,18 @@ Use ethtool to view and set link-down-on-close, as follows::
   ethtool --show-priv-flags ethX
   ethtool --set-priv-flags ethX link-down-on-close [on|off]
 
+Setting the mdd-auto-reset-vf Private Flag
+------------------------------------------
+
+When the mdd-auto-reset-vf private flag is set to "on", the problematic VF will
+be automatically reset if a malformed descriptor is detected. If the flag is
+set to "off", the problematic VF will be disabled.
+
+Use ethtool to view and set mdd-auto-reset-vf, as follows::
+
+  ethtool --show-priv-flags ethX
+  ethtool --set-priv-flags ethX mdd-auto-reset-vf [on|off]
+
 Viewing Link Messages
 ---------------------
 Link messages will not be displayed to the console if the distribution is
diff --git a/Documentation/networking/devlink/mlx5.rst b/Documentation/networking/devlink/mlx5.rst
index 456985407475..41618538fc70 100644
--- a/Documentation/networking/devlink/mlx5.rst
+++ b/Documentation/networking/devlink/mlx5.rst
@@ -53,6 +53,9 @@ parameters.
        * ``smfs`` Software managed flow steering. In SMFS mode, the HW
          steering entities are created and manage through the driver without
          firmware intervention.
+       * ``hmfs`` Hardware managed flow steering. In HMFS mode, the driver
+         is configuring steering rules directly to the HW using Work Queues with
+         a special new type of WQE (Work Queue Element).
 
        SMFS mode is faster and provides better rule insertion rate compared to
        default DMFS mode.
diff --git a/Documentation/networking/diagnostic/twisted_pair_layer1_diagnostics.rst b/Documentation/networking/diagnostic/twisted_pair_layer1_diagnostics.rst
index c9be5cc7e113..079e17effadf 100644
--- a/Documentation/networking/diagnostic/twisted_pair_layer1_diagnostics.rst
+++ b/Documentation/networking/diagnostic/twisted_pair_layer1_diagnostics.rst
@@ -713,17 +713,23 @@ driver supports reporting such events.
 
 - **Monitor Error Counters**:
 
-  - While some NIC drivers and PHYs provide error counters, there is no unified
-    set of PHY-specific counters across all hardware. Additionally, not all
-    PHYs provide useful information related to errors like CRC errors, frame
-    drops, or link flaps. Therefore, this step is dependent on the specific
-    hardware and driver support.
-
-  - **Next Steps**: Use `ethtool -S <interface>` to check if your driver
-    provides useful error counters. In some cases, counters may provide
-    information about errors like link flaps or physical layer problems (e.g.,
-    excessive CRC errors), but results can vary significantly depending on the
-    PHY.
+  - Use `ethtool -S <interface> --all-groups` to retrieve standardized interface
+    statistics if the driver supports the unified interface:
+
+  - **Command:** `ethtool -S <interface> --all-groups`
+
+  - **Example Output (if supported)**:
+
+    .. code-block:: bash
+
+      phydev-RxFrames: 100391
+      phydev-RxErrors: 0
+      phydev-TxFrames: 9
+      phydev-TxErrors: 0
+
+  - If the unified interface is not supported, use `ethtool -S <interface>` to
+    retrieve MAC and PHY counters. Note that non-standardized PHY counter names
+    vary by driver and must be interpreted accordingly:
 
   - **Command:** `ethtool -S <interface>`
 
@@ -740,6 +746,17 @@ driver supports reporting such events.
     condition) or kernel log messages (e.g., link up/down events) to further
     diagnose the issue.
 
+  - **Compare Counters**:
+
+    - Compare the egress and ingress frame counts reported by the PHY and MAC.
+
+    - A small difference may occur due to sampling rate differences between the
+      MAC and PHY drivers, or if the PHY and MAC are not always fully
+      synchronized in their UP or DOWN states.
+
+    - Significant discrepancies indicate potential issues in the data path
+      between the MAC and PHY.
+
 When All Else Fails...
 ~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst
index a7ba6368a4d5..da846f1d998e 100644
--- a/Documentation/networking/ethtool-netlink.rst
+++ b/Documentation/networking/ethtool-netlink.rst
@@ -1616,6 +1616,7 @@ the ``ETHTOOL_A_STATS_GROUPS`` bitset. Currently defined values are:
  ETHTOOL_STATS_ETH_PHY  eth-phy  Basic IEEE 802.3 PHY statistics (30.3.2.1.*)
  ETHTOOL_STATS_ETH_CTRL eth-ctrl Basic IEEE 802.3 MAC Ctrl statistics (30.3.3.*)
  ETHTOOL_STATS_RMON     rmon     RMON (RFC 2819) statistics
+ ETHTOOL_STATS_PHY      phy      Additional PHY statistics, not defined by IEEE
  ====================== ======== ===============================================
 
 Each group should have a corresponding ``ETHTOOL_A_STATS_GRP`` in the reply.
diff --git a/Documentation/networking/multi-pf-netdev.rst b/Documentation/networking/multi-pf-netdev.rst
index 2cd25d81aaa7..2f5a5bb3ca9a 100644
--- a/Documentation/networking/multi-pf-netdev.rst
+++ b/Documentation/networking/multi-pf-netdev.rst
@@ -89,7 +89,7 @@ Observability
 =============
 The relation between PF, irq, napi, and queue can be observed via netlink spec::
 
-  $ ./tools/net/ynl/cli.py --spec Documentation/netlink/specs/netdev.yaml --dump queue-get --json='{"ifindex": 13}'
+  $ ./tools/net/ynl/pyynl/cli.py --spec Documentation/netlink/specs/netdev.yaml --dump queue-get --json='{"ifindex": 13}'
   [{'id': 0, 'ifindex': 13, 'napi-id': 539, 'type': 'rx'},
    {'id': 1, 'ifindex': 13, 'napi-id': 540, 'type': 'rx'},
    {'id': 2, 'ifindex': 13, 'napi-id': 541, 'type': 'rx'},
@@ -101,7 +101,7 @@ The relation between PF, irq, napi, and queue can be observed via netlink spec::
    {'id': 3, 'ifindex': 13, 'napi-id': 542, 'type': 'tx'},
    {'id': 4, 'ifindex': 13, 'napi-id': 543, 'type': 'tx'}]
 
-  $ ./tools/net/ynl/cli.py --spec Documentation/netlink/specs/netdev.yaml --dump napi-get --json='{"ifindex": 13}'
+  $ ./tools/net/ynl/pyynl/cli.py --spec Documentation/netlink/specs/netdev.yaml --dump napi-get --json='{"ifindex": 13}'
   [{'id': 543, 'ifindex': 13, 'irq': 42},
    {'id': 542, 'ifindex': 13, 'irq': 41},
    {'id': 541, 'ifindex': 13, 'irq': 40},
diff --git a/Documentation/networking/napi.rst b/Documentation/networking/napi.rst
index 02720dd71a76..6083210ab2a4 100644
--- a/Documentation/networking/napi.rst
+++ b/Documentation/networking/napi.rst
@@ -199,13 +199,13 @@ parameters mentioned above use hyphens instead of underscores:
 
 Per-NAPI configuration can be done programmatically in a user application
 or by using a script included in the kernel source tree:
-``tools/net/ynl/cli.py``.
+``tools/net/ynl/pyynl/cli.py``.
 
 For example, using the script:
 
 .. code-block:: bash
 
-  $ kernel-source/tools/net/ynl/cli.py \
+  $ kernel-source/tools/net/ynl/pyynl/cli.py \
             --spec Documentation/netlink/specs/netdev.yaml \
             --do napi-set \
             --json='{"id": 345,
diff --git a/Documentation/networking/netdevices.rst b/Documentation/networking/netdevices.rst
index 857c9784f87e..1d37038e9fbe 100644
--- a/Documentation/networking/netdevices.rst
+++ b/Documentation/networking/netdevices.rst
@@ -297,3 +297,13 @@ napi->poll:
 	Context:
 		 softirq
 		 will be called with interrupts disabled by netconsole.
+
+NETDEV_INTERNAL symbol namespace
+================================
+
+Symbols exported as NETDEV_INTERNAL can only be used in networking
+core and drivers which exclusively flow via the main networking list and trees.
+Note that the inverse is not true, most symbols outside of NETDEV_INTERNAL
+are not expected to be used by random code outside netdev either.
+Symbols may lack the designation because they predate the namespaces,
+or simply due to an oversight.
diff --git a/Documentation/networking/netlink_spec/readme.txt b/Documentation/networking/netlink_spec/readme.txt
index 6763f99d216c..030b44aca4e6 100644
--- a/Documentation/networking/netlink_spec/readme.txt
+++ b/Documentation/networking/netlink_spec/readme.txt
@@ -1,4 +1,4 @@
 SPDX-License-Identifier: GPL-2.0
 
 This file is populated during the build of the documentation (htmldocs) by the
-tools/net/ynl/ynl-gen-rst.py script.
+tools/net/ynl/pyynl/ynl_gen_rst.py script.
diff --git a/Documentation/networking/xfrm_device.rst b/Documentation/networking/xfrm_device.rst
index bfea9d8579ed..66f6e9a9b59a 100644
--- a/Documentation/networking/xfrm_device.rst
+++ b/Documentation/networking/xfrm_device.rst
@@ -169,7 +169,8 @@ the stack in xfrm_input().
 
 	hand the packet to napi_gro_receive() as usual
 
-In ESN mode, xdo_dev_state_advance_esn() is called from xfrm_replay_advance_esn().
+In ESN mode, xdo_dev_state_advance_esn() is called from
+xfrm_replay_advance_esn() for RX, and xfrm_replay_overflow_offload_esn for TX.
 Driver will check packet seq number and update HW ESN state machine if needed.
 
 Packet offload mode:
diff --git a/Documentation/userspace-api/netlink/intro-specs.rst b/Documentation/userspace-api/netlink/intro-specs.rst
index bada89699455..a4435ae4628d 100644
--- a/Documentation/userspace-api/netlink/intro-specs.rst
+++ b/Documentation/userspace-api/netlink/intro-specs.rst
@@ -15,7 +15,7 @@ developing Netlink related code. The tool is implemented in Python
 and can use a YAML specification to issue Netlink requests
 to the kernel. Only Generic Netlink is supported.
 
-The tool is located at ``tools/net/ynl/cli.py``. It accepts
+The tool is located at ``tools/net/ynl/pyynl/cli.py``. It accepts
 a handul of arguments, the most important ones are:
 
  - ``--spec`` - point to the spec file
@@ -27,7 +27,7 @@ YAML specs can be found under ``Documentation/netlink/specs/``.
 
 Example use::
 
-  $ ./tools/net/ynl/cli.py --spec Documentation/netlink/specs/ethtool.yaml \
+  $ ./tools/net/ynl/pyynl/cli.py --spec Documentation/netlink/specs/ethtool.yaml \
         --do rings-get \
 	--json '{"header":{"dev-index": 18}}'
   {'header': {'dev-index': 18, 'dev-name': 'eni1np1'},
@@ -75,7 +75,7 @@ the two marker lines like above to a file, add that file to git,
 and run the regeneration tool. Grep the tree for ``YNL-GEN``
 to see other examples.
 
-The code generation itself is performed by ``tools/net/ynl/ynl-gen-c.py``
+The code generation itself is performed by ``tools/net/ynl/pyynl/ynl_gen_c.py``
 but it takes a few arguments so calling it directly for each file
 quickly becomes tedious.
 
@@ -84,7 +84,7 @@ YNL lib
 
 ``tools/net/ynl/lib/`` contains an implementation of a C library
 (based on libmnl) which integrates with code generated by
-``tools/net/ynl/ynl-gen-c.py`` to create easy to use netlink wrappers.
+``tools/net/ynl/pyynl/ynl_gen_c.py`` to create easy to use netlink wrappers.
 
 YNL basics
 ----------
diff --git a/MAINTAINERS b/MAINTAINERS
index a685c551faf0..797e94ba74f7 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -949,7 +949,6 @@ AMAZON ETHERNET DRIVERS
 M:	Shay Agroskin <shayagr@amazon.com>
 M:	Arthur Kiyanovski <akiyano@amazon.com>
 R:	David Arinzon <darinzon@amazon.com>
-R:	Noam Dagan <ndagan@amazon.com>
 R:	Saeed Bishara <saeedb@amazon.com>
 L:	netdev@vger.kernel.org
 S:	Supported
@@ -2690,7 +2689,6 @@ N:	at91
 N:	atmel
 
 ARM/Microchip Sparx5 SoC support
-M:	Lars Povlsen <lars.povlsen@microchip.com>
 M:	Steen Hegelund <Steen.Hegelund@microchip.com>
 M:	Daniel Machon <daniel.machon@microchip.com>
 M:	UNGLinuxDriver@microchip.com
@@ -4065,7 +4063,6 @@ F:	net/bluetooth/
 
 BONDING DRIVER
 M:	Jay Vosburgh <jv@jvosburgh.net>
-M:	Andy Gospodarek <andy@greyhouse.net>
 L:	netdev@vger.kernel.org
 S:	Maintained
 F:	Documentation/networking/bonding.rst
@@ -5118,6 +5115,7 @@ F:	include/uapi/linux/can/gw.h
 F:	include/uapi/linux/can/isotp.h
 F:	include/uapi/linux/can/raw.h
 F:	net/can/
+F:	net/sched/em_canid.c
 
 CAN-J1939 NETWORK LAYER
 M:	Robin van der Gracht <robin@protonic.nl>
@@ -14574,7 +14572,6 @@ F:	drivers/dma/mediatek/
 MEDIATEK ETHERNET DRIVER
 M:	Felix Fietkau <nbd@nbd.name>
 M:	Sean Wang <sean.wang@mediatek.com>
-M:	Mark Lee <Mark-MC.Lee@mediatek.com>
 M:	Lorenzo Bianconi <lorenzo@kernel.org>
 L:	netdev@vger.kernel.org
 S:	Maintained
@@ -14764,7 +14761,7 @@ F:	drivers/memory/mtk-smi.c
 F:	include/soc/mediatek/smi.h
 
 MEDIATEK SWITCH DRIVER
-M:	Arınç ÜNAL <arinc.unal@arinc9.com>
+M:	Chester A. Unal <chester.a.unal@arinc9.com>
 M:	Daniel Golle <daniel@makrotopia.org>
 M:	DENG Qingfang <dqfext@gmail.com>
 M:	Sean Wang <sean.wang@mediatek.com>
@@ -16174,7 +16171,8 @@ M:	Breno Leitao <leitao@debian.org>
 S:	Maintained
 F:	Documentation/networking/netconsole.rst
 F:	drivers/net/netconsole.c
-F:	tools/testing/selftests/drivers/net/netcons_basic.sh
+F:	tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh
+F:	tools/testing/selftests/drivers/net/netcons\*
 
 NETDEVSIM
 M:	Jakub Kicinski <kuba@kernel.org>
@@ -18469,7 +18467,7 @@ F:	Documentation/devicetree/bindings/pinctrl/mediatek,mt8183-pinctrl.yaml
 F:	drivers/pinctrl/mediatek/
 
 PIN CONTROLLER - MEDIATEK MIPS
-M:	Arınç ÜNAL <arinc.unal@arinc9.com>
+M:	Chester A. Unal <chester.a.unal@arinc9.com>
 M:	Sergio Paracuellos <sergio.paracuellos@gmail.com>
 L:	linux-mediatek@lists.infradead.org (moderated for non-subscribers)
 L:	linux-mips@vger.kernel.org
@@ -19513,7 +19511,7 @@ S:	Maintained
 F:	arch/mips/ralink
 
 RALINK MT7621 MIPS ARCHITECTURE
-M:	Arınç ÜNAL <arinc.unal@arinc9.com>
+M:	Chester A. Unal <chester.a.unal@arinc9.com>
 M:	Sergio Paracuellos <sergio.paracuellos@gmail.com>
 L:	linux-mips@vger.kernel.org
 S:	Maintained
@@ -20916,6 +20914,8 @@ F:	kernel/sched/
 SCHEDULER - SCHED_EXT
 R:	Tejun Heo <tj@kernel.org>
 R:	David Vernet <void@manifault.com>
+R:	Andrea Righi <arighi@nvidia.com>
+R:	Changwoo Min <changwoo@igalia.com>
 L:	linux-kernel@vger.kernel.org
 S:	Maintained
 W:	https://github.com/sched-ext/scx
@@ -22510,11 +22510,8 @@ F:	Documentation/devicetree/bindings/phy/st,stm32mp25-combophy.yaml
 F:	drivers/phy/st/phy-stm32-combophy.c
 
 STMMAC ETHERNET DRIVER
-M:	Alexandre Torgue <alexandre.torgue@foss.st.com>
-M:	Jose Abreu <joabreu@synopsys.com>
 L:	netdev@vger.kernel.org
-S:	Supported
-W:	http://www.stlinux.com
+S:	Orphan
 F:	Documentation/networking/device_drivers/ethernet/stmicro/
 F:	drivers/net/ethernet/stmicro/stmmac/
 
@@ -22746,9 +22743,8 @@ S:	Supported
 F:	drivers/net/ethernet/synopsys/
 
 SYNOPSYS DESIGNWARE ETHERNET XPCS DRIVER
-M:	Jose Abreu <Jose.Abreu@synopsys.com>
 L:	netdev@vger.kernel.org
-S:	Supported
+S:	Orphan
 F:	drivers/net/pcs/pcs-xpcs.c
 F:	drivers/net/pcs/pcs-xpcs.h
 F:	include/linux/pcs/pcs-xpcs.h
@@ -23656,7 +23652,6 @@ F:	tools/testing/selftests/timers/
 
 TIPC NETWORK LAYER
 M:	Jon Maloy <jmaloy@redhat.com>
-M:	Ying Xue <ying.xue@windriver.com>
 L:	netdev@vger.kernel.org (core kernel code)
 L:	tipc-discussion@lists.sourceforge.net (user apps, general discussion)
 S:	Maintained
@@ -24262,7 +24257,8 @@ F:	Documentation/devicetree/bindings/usb/nxp,isp1760.yaml
 F:	drivers/usb/isp1760/*
 
 USB LAN78XX ETHERNET DRIVER
-M:	Woojung Huh <woojung.huh@microchip.com>
+M:	Thangaraj Samynathan <Thangaraj.S@microchip.com>
+M:	Rengarajan Sundararajan <Rengarajan.S@microchip.com>
 M:	UNGLinuxDriver@microchip.com
 L:	netdev@vger.kernel.org
 S:	Maintained
diff --git a/Makefile b/Makefile
index aae2d92f7827..b11454a2233a 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,7 @@
 VERSION = 6
 PATCHLEVEL = 13
 SUBLEVEL = 0
-EXTRAVERSION = -rc5
+EXTRAVERSION = -rc6
 NAME = Baby Opossum Posse
 
 # *DOCUMENTATION*
diff --git a/drivers/bluetooth/btmtk.c b/drivers/bluetooth/btmtk.c
index 7fd9d5ddce02..224eafc27dbe 100644
--- a/drivers/bluetooth/btmtk.c
+++ b/drivers/bluetooth/btmtk.c
@@ -1472,10 +1472,15 @@ EXPORT_SYMBOL_GPL(btmtk_usb_setup);
 
 int btmtk_usb_shutdown(struct hci_dev *hdev)
 {
+	struct btmtk_data *data = hci_get_priv(hdev);
 	struct btmtk_hci_wmt_params wmt_params;
 	u8 param = 0;
 	int err;
 
+	err = usb_autopm_get_interface(data->intf);
+	if (err < 0)
+		return err;
+
 	/* Disable the device */
 	wmt_params.op = BTMTK_WMT_FUNC_CTRL;
 	wmt_params.flag = 0;
@@ -1486,9 +1491,11 @@ int btmtk_usb_shutdown(struct hci_dev *hdev)
 	err = btmtk_usb_hci_wmt_sync(hdev, &wmt_params);
 	if (err < 0) {
 		bt_dev_err(hdev, "Failed to send wmt func ctrl (%d)", err);
+		usb_autopm_put_interface(data->intf);
 		return err;
 	}
 
+	usb_autopm_put_interface(data->intf);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(btmtk_usb_shutdown);
diff --git a/drivers/bluetooth/btnxpuart.c b/drivers/bluetooth/btnxpuart.c
index 569f5b7d6e46..1230045d78a5 100644
--- a/drivers/bluetooth/btnxpuart.c
+++ b/drivers/bluetooth/btnxpuart.c
@@ -1381,6 +1381,7 @@ static void btnxpuart_tx_work(struct work_struct *work)
 
 	while ((skb = nxp_dequeue(nxpdev))) {
 		len = serdev_device_write_buf(serdev, skb->data, skb->len);
+		serdev_device_wait_until_sent(serdev, 0);
 		hdev->stat.byte_tx += len;
 
 		skb_pull(skb, len);
diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c
index 6a99a459b80b..51745ed1bbab 100644
--- a/drivers/cdrom/cdrom.c
+++ b/drivers/cdrom/cdrom.c
@@ -1106,7 +1106,7 @@ int open_for_data(struct cdrom_device_info *cdi)
 		}
 	}
 
-	cd_dbg(CD_OPEN, "all seems well, opening the devicen");
+	cd_dbg(CD_OPEN, "all seems well, opening the device\n");
 
 	/* all seems well, we can open the device */
 	ret = cdo->open(cdi, 0); /* open for data */
diff --git a/drivers/clk/imx/clk-imx8mp-audiomix.c b/drivers/clk/imx/clk-imx8mp-audiomix.c
index b2cb157703c5..c409fc7e0618 100644
--- a/drivers/clk/imx/clk-imx8mp-audiomix.c
+++ b/drivers/clk/imx/clk-imx8mp-audiomix.c
@@ -278,7 +278,8 @@ static int clk_imx8mp_audiomix_reset_controller_register(struct device *dev,
 
 #else /* !CONFIG_RESET_CONTROLLER */
 
-static int clk_imx8mp_audiomix_reset_controller_register(struct clk_imx8mp_audiomix_priv *priv)
+static int clk_imx8mp_audiomix_reset_controller_register(struct device *dev,
+							 struct clk_imx8mp_audiomix_priv *priv)
 {
 	return 0;
 }
diff --git a/drivers/clk/thead/clk-th1520-ap.c b/drivers/clk/thead/clk-th1520-ap.c
index 17e32ae08720..1015fab95251 100644
--- a/drivers/clk/thead/clk-th1520-ap.c
+++ b/drivers/clk/thead/clk-th1520-ap.c
@@ -779,6 +779,13 @@ static struct ccu_div dpu1_clk = {
 	},
 };
 
+static CLK_FIXED_FACTOR_HW(emmc_sdio_ref_clk, "emmc-sdio-ref",
+			   &video_pll_clk.common.hw, 4, 1, 0);
+
+static const struct clk_parent_data emmc_sdio_ref_clk_pd[] = {
+	{ .hw = &emmc_sdio_ref_clk.hw },
+};
+
 static CCU_GATE(CLK_BROM, brom_clk, "brom", ahb2_cpusys_hclk_pd, 0x100, BIT(4), 0);
 static CCU_GATE(CLK_BMU, bmu_clk, "bmu", axi4_cpusys2_aclk_pd, 0x100, BIT(5), 0);
 static CCU_GATE(CLK_AON2CPU_A2X, aon2cpu_a2x_clk, "aon2cpu-a2x", axi4_cpusys2_aclk_pd,
@@ -798,7 +805,7 @@ static CCU_GATE(CLK_PERISYS_APB4_HCLK, perisys_apb4_hclk, "perisys-apb4-hclk", p
 		0x150, BIT(12), 0);
 static CCU_GATE(CLK_NPU_AXI, npu_axi_clk, "npu-axi", axi_aclk_pd, 0x1c8, BIT(5), 0);
 static CCU_GATE(CLK_CPU2VP, cpu2vp_clk, "cpu2vp", axi_aclk_pd, 0x1e0, BIT(13), 0);
-static CCU_GATE(CLK_EMMC_SDIO, emmc_sdio_clk, "emmc-sdio", video_pll_clk_pd, 0x204, BIT(30), 0);
+static CCU_GATE(CLK_EMMC_SDIO, emmc_sdio_clk, "emmc-sdio", emmc_sdio_ref_clk_pd, 0x204, BIT(30), 0);
 static CCU_GATE(CLK_GMAC1, gmac1_clk, "gmac1", gmac_pll_clk_pd, 0x204, BIT(26), 0);
 static CCU_GATE(CLK_PADCTRL1, padctrl1_clk, "padctrl1", perisys_apb_pclk_pd, 0x204, BIT(24), 0);
 static CCU_GATE(CLK_DSMART, dsmart_clk, "dsmart", perisys_apb_pclk_pd, 0x204, BIT(23), 0);
@@ -1059,6 +1066,10 @@ static int th1520_clk_probe(struct platform_device *pdev)
 		return ret;
 	priv->hws[CLK_PLL_GMAC_100M] = &gmac_pll_clk_100m.hw;
 
+	ret = devm_clk_hw_register(dev, &emmc_sdio_ref_clk.hw);
+	if (ret)
+		return ret;
+
 	ret = devm_of_clk_add_hw_provider(dev, of_clk_hw_onecell_get, priv);
 	if (ret)
 		return ret;
diff --git a/drivers/md/dm-ebs-target.c b/drivers/md/dm-ebs-target.c
index ec5db1478b2f..18ae45dcbfb2 100644
--- a/drivers/md/dm-ebs-target.c
+++ b/drivers/md/dm-ebs-target.c
@@ -442,7 +442,7 @@ static int ebs_iterate_devices(struct dm_target *ti,
 static struct target_type ebs_target = {
 	.name		 = "ebs",
 	.version	 = {1, 0, 1},
-	.features	 = DM_TARGET_PASSES_INTEGRITY,
+	.features	 = 0,
 	.module		 = THIS_MODULE,
 	.ctr		 = ebs_ctr,
 	.dtr		 = ebs_dtr,
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index bf0f9dddd146..05cf4e3f2bbe 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -2332,10 +2332,9 @@ static struct thin_c *get_first_thin(struct pool *pool)
 	struct thin_c *tc = NULL;
 
 	rcu_read_lock();
-	if (!list_empty(&pool->active_thins)) {
-		tc = list_entry_rcu(pool->active_thins.next, struct thin_c, list);
+	tc = list_first_or_null_rcu(&pool->active_thins, struct thin_c, list);
+	if (tc)
 		thin_get(tc);
-	}
 	rcu_read_unlock();
 
 	return tc;
diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c
index 62b1a44b8dd2..e61855da6461 100644
--- a/drivers/md/dm-verity-fec.c
+++ b/drivers/md/dm-verity-fec.c
@@ -40,35 +40,23 @@ static inline u64 fec_interleave(struct dm_verity *v, u64 offset)
 }
 
 /*
- * Decode an RS block using Reed-Solomon.
- */
-static int fec_decode_rs8(struct dm_verity *v, struct dm_verity_fec_io *fio,
-			  u8 *data, u8 *fec, int neras)
-{
-	int i;
-	uint16_t par[DM_VERITY_FEC_RSM - DM_VERITY_FEC_MIN_RSN];
-
-	for (i = 0; i < v->fec->roots; i++)
-		par[i] = fec[i];
-
-	return decode_rs8(fio->rs, data, par, v->fec->rsn, NULL, neras,
-			  fio->erasures, 0, NULL);
-}
-
-/*
  * Read error-correcting codes for the requested RS block. Returns a pointer
  * to the data block. Caller is responsible for releasing buf.
  */
 static u8 *fec_read_parity(struct dm_verity *v, u64 rsb, int index,
-			   unsigned int *offset, struct dm_buffer **buf,
-			   unsigned short ioprio)
+			   unsigned int *offset, unsigned int par_buf_offset,
+			   struct dm_buffer **buf, unsigned short ioprio)
 {
 	u64 position, block, rem;
 	u8 *res;
 
+	/* We have already part of parity bytes read, skip to the next block */
+	if (par_buf_offset)
+		index++;
+
 	position = (index + rsb) * v->fec->roots;
 	block = div64_u64_rem(position, v->fec->io_size, &rem);
-	*offset = (unsigned int)rem;
+	*offset = par_buf_offset ? 0 : (unsigned int)rem;
 
 	res = dm_bufio_read_with_ioprio(v->fec->bufio, block, buf, ioprio);
 	if (IS_ERR(res)) {
@@ -128,11 +116,13 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_io *io,
 {
 	int r, corrected = 0, res;
 	struct dm_buffer *buf;
-	unsigned int n, i, offset;
+	unsigned int n, i, j, offset, par_buf_offset = 0;
+	uint16_t par_buf[DM_VERITY_FEC_RSM - DM_VERITY_FEC_MIN_RSN];
 	u8 *par, *block;
 	struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size);
 
-	par = fec_read_parity(v, rsb, block_offset, &offset, &buf, bio_prio(bio));
+	par = fec_read_parity(v, rsb, block_offset, &offset,
+			      par_buf_offset, &buf, bio_prio(bio));
 	if (IS_ERR(par))
 		return PTR_ERR(par);
 
@@ -142,7 +132,11 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_io *io,
 	 */
 	fec_for_each_buffer_rs_block(fio, n, i) {
 		block = fec_buffer_rs_block(v, fio, n, i);
-		res = fec_decode_rs8(v, fio, block, &par[offset], neras);
+		for (j = 0; j < v->fec->roots - par_buf_offset; j++)
+			par_buf[par_buf_offset + j] = par[offset + j];
+		/* Decode an RS block using Reed-Solomon */
+		res = decode_rs8(fio->rs, block, par_buf, v->fec->rsn,
+				 NULL, neras, fio->erasures, 0, NULL);
 		if (res < 0) {
 			r = res;
 			goto error;
@@ -155,12 +149,22 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_io *io,
 		if (block_offset >= 1 << v->data_dev_block_bits)
 			goto done;
 
-		/* read the next block when we run out of parity bytes */
-		offset += v->fec->roots;
+		/* Read the next block when we run out of parity bytes */
+		offset += (v->fec->roots - par_buf_offset);
+		/* Check if parity bytes are split between blocks */
+		if (offset < v->fec->io_size && (offset + v->fec->roots) > v->fec->io_size) {
+			par_buf_offset = v->fec->io_size - offset;
+			for (j = 0; j < par_buf_offset; j++)
+				par_buf[j] = par[offset + j];
+			offset += par_buf_offset;
+		} else
+			par_buf_offset = 0;
+
 		if (offset >= v->fec->io_size) {
 			dm_bufio_release(buf);
 
-			par = fec_read_parity(v, rsb, block_offset, &offset, &buf, bio_prio(bio));
+			par = fec_read_parity(v, rsb, block_offset, &offset,
+					      par_buf_offset, &buf, bio_prio(bio));
 			if (IS_ERR(par))
 				return PTR_ERR(par);
 		}
@@ -724,10 +728,7 @@ int verity_fec_ctr(struct dm_verity *v)
 		return -E2BIG;
 	}
 
-	if ((f->roots << SECTOR_SHIFT) & ((1 << v->data_dev_block_bits) - 1))
-		f->io_size = 1 << v->data_dev_block_bits;
-	else
-		f->io_size = v->fec->roots << SECTOR_SHIFT;
+	f->io_size = 1 << v->data_dev_block_bits;
 
 	f->bufio = dm_bufio_client_create(f->dev->bdev,
 					  f->io_size,
diff --git a/drivers/md/persistent-data/dm-array.c b/drivers/md/persistent-data/dm-array.c
index 157c9bd2fed7..8f8792e55806 100644
--- a/drivers/md/persistent-data/dm-array.c
+++ b/drivers/md/persistent-data/dm-array.c
@@ -917,23 +917,27 @@ static int load_ablock(struct dm_array_cursor *c)
 	if (c->block)
 		unlock_ablock(c->info, c->block);
 
-	c->block = NULL;
-	c->ab = NULL;
 	c->index = 0;
 
 	r = dm_btree_cursor_get_value(&c->cursor, &key, &value_le);
 	if (r) {
 		DMERR("dm_btree_cursor_get_value failed");
-		dm_btree_cursor_end(&c->cursor);
+		goto out;
 
 	} else {
 		r = get_ablock(c->info, le64_to_cpu(value_le), &c->block, &c->ab);
 		if (r) {
 			DMERR("get_ablock failed");
-			dm_btree_cursor_end(&c->cursor);
+			goto out;
 		}
 	}
 
+	return 0;
+
+out:
+	dm_btree_cursor_end(&c->cursor);
+	c->block = NULL;
+	c->ab = NULL;
 	return r;
 }
 
@@ -956,10 +960,10 @@ EXPORT_SYMBOL_GPL(dm_array_cursor_begin);
 
 void dm_array_cursor_end(struct dm_array_cursor *c)
 {
-	if (c->block) {
+	if (c->block)
 		unlock_ablock(c->info, c->block);
-		dm_btree_cursor_end(&c->cursor);
-	}
+
+	dm_btree_cursor_end(&c->cursor);
 }
 EXPORT_SYMBOL_GPL(dm_array_cursor_end);
 
@@ -999,6 +1003,7 @@ int dm_array_cursor_skip(struct dm_array_cursor *c, uint32_t count)
 		}
 
 		count -= remaining;
+		c->index += (remaining - 1);
 		r = dm_array_cursor_next(c);
 
 	} while (!r);
diff --git a/drivers/net/can/dev/dev.c b/drivers/net/can/dev/dev.c
index 681643ab3780..5ec3170b896a 100644
--- a/drivers/net/can/dev/dev.c
+++ b/drivers/net/can/dev/dev.c
@@ -85,8 +85,6 @@ const char *can_get_state_str(const enum can_state state)
 	default:
 		return "<unknown>";
 	}
-
-	return "<unknown>";
 }
 EXPORT_SYMBOL_GPL(can_get_state_str);
 
diff --git a/drivers/net/can/grcan.c b/drivers/net/can/grcan.c
index cdf0ec9fa7f3..21a61b86f67d 100644
--- a/drivers/net/can/grcan.c
+++ b/drivers/net/can/grcan.c
@@ -1073,9 +1073,10 @@ static int grcan_open(struct net_device *dev)
 	if (err)
 		goto exit_close_candev;
 
+	napi_enable(&priv->napi);
+
 	spin_lock_irqsave(&priv->lock, flags);
 
-	napi_enable(&priv->napi);
 	grcan_start(dev);
 	if (!(priv->can.ctrlmode & CAN_CTRLMODE_LISTENONLY))
 		netif_start_queue(dev);
diff --git a/drivers/net/can/kvaser_pciefd.c b/drivers/net/can/kvaser_pciefd.c
index fee012b57f33..fa04a7ced02b 100644
--- a/drivers/net/can/kvaser_pciefd.c
+++ b/drivers/net/can/kvaser_pciefd.c
@@ -999,7 +999,8 @@ static int kvaser_pciefd_setup_can_ctrls(struct kvaser_pciefd *pcie)
 		can->can.ctrlmode_supported = CAN_CTRLMODE_LISTENONLY |
 					      CAN_CTRLMODE_FD |
 					      CAN_CTRLMODE_FD_NON_ISO |
-					      CAN_CTRLMODE_CC_LEN8_DLC;
+					      CAN_CTRLMODE_CC_LEN8_DLC |
+					      CAN_CTRLMODE_BERR_REPORTING;
 
 		status = ioread32(can->reg_base + KVASER_PCIEFD_KCAN_STAT_REG);
 		if (!(status & KVASER_PCIEFD_KCAN_STAT_FD)) {
@@ -1234,11 +1235,15 @@ static int kvaser_pciefd_handle_data_packet(struct kvaser_pciefd *pcie,
 }
 
 static void kvaser_pciefd_change_state(struct kvaser_pciefd_can *can,
+				       const struct can_berr_counter *bec,
 				       struct can_frame *cf,
 				       enum can_state new_state,
 				       enum can_state tx_state,
 				       enum can_state rx_state)
 {
+	enum can_state old_state;
+
+	old_state = can->can.state;
 	can_change_state(can->can.dev, cf, tx_state, rx_state);
 
 	if (new_state == CAN_STATE_BUS_OFF) {
@@ -1254,6 +1259,18 @@ static void kvaser_pciefd_change_state(struct kvaser_pciefd_can *can,
 			can_bus_off(ndev);
 		}
 	}
+	if (old_state == CAN_STATE_BUS_OFF &&
+	    new_state == CAN_STATE_ERROR_ACTIVE &&
+	    can->can.restart_ms) {
+		can->can.can_stats.restarts++;
+		if (cf)
+			cf->can_id |= CAN_ERR_RESTARTED;
+	}
+	if (cf && new_state != CAN_STATE_BUS_OFF) {
+		cf->can_id |= CAN_ERR_CNT;
+		cf->data[6] = bec->txerr;
+		cf->data[7] = bec->rxerr;
+	}
 }
 
 static void kvaser_pciefd_packet_to_state(struct kvaser_pciefd_rx_packet *p,
@@ -1288,7 +1305,7 @@ static int kvaser_pciefd_rx_error_frame(struct kvaser_pciefd_can *can,
 	struct can_berr_counter bec;
 	enum can_state old_state, new_state, tx_state, rx_state;
 	struct net_device *ndev = can->can.dev;
-	struct sk_buff *skb;
+	struct sk_buff *skb = NULL;
 	struct can_frame *cf = NULL;
 
 	old_state = can->can.state;
@@ -1297,16 +1314,10 @@ static int kvaser_pciefd_rx_error_frame(struct kvaser_pciefd_can *can,
 	bec.rxerr = FIELD_GET(KVASER_PCIEFD_SPACK_RXERR_MASK, p->header[0]);
 
 	kvaser_pciefd_packet_to_state(p, &bec, &new_state, &tx_state, &rx_state);
-	skb = alloc_can_err_skb(ndev, &cf);
+	if (can->can.ctrlmode & CAN_CTRLMODE_BERR_REPORTING)
+		skb = alloc_can_err_skb(ndev, &cf);
 	if (new_state != old_state) {
-		kvaser_pciefd_change_state(can, cf, new_state, tx_state, rx_state);
-		if (old_state == CAN_STATE_BUS_OFF &&
-		    new_state == CAN_STATE_ERROR_ACTIVE &&
-		    can->can.restart_ms) {
-			can->can.can_stats.restarts++;
-			if (skb)
-				cf->can_id |= CAN_ERR_RESTARTED;
-		}
+		kvaser_pciefd_change_state(can, &bec, cf, new_state, tx_state, rx_state);
 	}
 
 	can->err_rep_cnt++;
@@ -1319,18 +1330,19 @@ static int kvaser_pciefd_rx_error_frame(struct kvaser_pciefd_can *can,
 	can->bec.txerr = bec.txerr;
 	can->bec.rxerr = bec.rxerr;
 
-	if (!skb) {
-		ndev->stats.rx_dropped++;
-		return -ENOMEM;
+	if (can->can.ctrlmode & CAN_CTRLMODE_BERR_REPORTING) {
+		if (!skb) {
+			netdev_warn(ndev, "No memory left for err_skb\n");
+			ndev->stats.rx_dropped++;
+			return -ENOMEM;
+		}
+		kvaser_pciefd_set_skb_timestamp(can->kv_pcie, skb, p->timestamp);
+		cf->can_id |= CAN_ERR_BUSERROR | CAN_ERR_CNT;
+		cf->data[6] = bec.txerr;
+		cf->data[7] = bec.rxerr;
+		netif_rx(skb);
 	}
 
-	kvaser_pciefd_set_skb_timestamp(can->kv_pcie, skb, p->timestamp);
-	cf->can_id |= CAN_ERR_BUSERROR | CAN_ERR_CNT;
-	cf->data[6] = bec.txerr;
-	cf->data[7] = bec.rxerr;
-
-	netif_rx(skb);
-
 	return 0;
 }
 
@@ -1359,6 +1371,7 @@ static int kvaser_pciefd_handle_status_resp(struct kvaser_pciefd_can *can,
 {
 	struct can_berr_counter bec;
 	enum can_state old_state, new_state, tx_state, rx_state;
+	int ret = 0;
 
 	old_state = can->can.state;
 
@@ -1372,25 +1385,15 @@ static int kvaser_pciefd_handle_status_resp(struct kvaser_pciefd_can *can,
 		struct can_frame *cf;
 
 		skb = alloc_can_err_skb(ndev, &cf);
-		if (!skb) {
+		kvaser_pciefd_change_state(can, &bec, cf, new_state, tx_state, rx_state);
+		if (skb) {
+			kvaser_pciefd_set_skb_timestamp(can->kv_pcie, skb, p->timestamp);
+			netif_rx(skb);
+		} else {
 			ndev->stats.rx_dropped++;
-			return -ENOMEM;
+			netdev_warn(ndev, "No memory left for err_skb\n");
+			ret = -ENOMEM;
 		}
-
-		kvaser_pciefd_change_state(can, cf, new_state, tx_state, rx_state);
-		if (old_state == CAN_STATE_BUS_OFF &&
-		    new_state == CAN_STATE_ERROR_ACTIVE &&
-		    can->can.restart_ms) {
-			can->can.can_stats.restarts++;
-			cf->can_id |= CAN_ERR_RESTARTED;
-		}
-
-		kvaser_pciefd_set_skb_timestamp(can->kv_pcie, skb, p->timestamp);
-
-		cf->data[6] = bec.txerr;
-		cf->data[7] = bec.rxerr;
-
-		netif_rx(skb);
 	}
 	can->bec.txerr = bec.txerr;
 	can->bec.rxerr = bec.rxerr;
@@ -1398,7 +1401,7 @@ static int kvaser_pciefd_handle_status_resp(struct kvaser_pciefd_can *can,
 	if (bec.txerr || bec.rxerr)
 		mod_timer(&can->bec_poll_timer, KVASER_PCIEFD_BEC_POLL_FREQ);
 
-	return 0;
+	return ret;
 }
 
 static int kvaser_pciefd_handle_status_packet(struct kvaser_pciefd *pcie,
diff --git a/drivers/net/can/m_can/m_can.c b/drivers/net/can/m_can/m_can.c
index 97cd8bbf2e32..d025d4163fd1 100644
--- a/drivers/net/can/m_can/m_can.c
+++ b/drivers/net/can/m_can/m_can.c
@@ -1785,6 +1785,13 @@ static void m_can_stop(struct net_device *dev)
 
 	/* set the state as STOPPED */
 	cdev->can.state = CAN_STATE_STOPPED;
+
+	if (cdev->ops->deinit) {
+		ret = cdev->ops->deinit(cdev);
+		if (ret)
+			netdev_err(dev, "failed to deinitialize: %pe\n",
+				   ERR_PTR(ret));
+	}
 }
 
 static int m_can_close(struct net_device *dev)
@@ -2466,6 +2473,7 @@ int m_can_class_suspend(struct device *dev)
 {
 	struct m_can_classdev *cdev = dev_get_drvdata(dev);
 	struct net_device *ndev = cdev->net;
+	int ret = 0;
 
 	if (netif_running(ndev)) {
 		netif_stop_queue(ndev);
@@ -2478,6 +2486,9 @@ int m_can_class_suspend(struct device *dev)
 		if (cdev->pm_wake_source) {
 			hrtimer_cancel(&cdev->hrtimer);
 			m_can_write(cdev, M_CAN_IE, IR_RF0N);
+
+			if (cdev->ops->deinit)
+				ret = cdev->ops->deinit(cdev);
 		} else {
 			m_can_stop(ndev);
 		}
@@ -2489,7 +2500,7 @@ int m_can_class_suspend(struct device *dev)
 
 	cdev->can.state = CAN_STATE_SLEEPING;
 
-	return 0;
+	return ret;
 }
 EXPORT_SYMBOL_GPL(m_can_class_suspend);
 
@@ -2497,14 +2508,13 @@ int m_can_class_resume(struct device *dev)
 {
 	struct m_can_classdev *cdev = dev_get_drvdata(dev);
 	struct net_device *ndev = cdev->net;
+	int ret = 0;
 
 	pinctrl_pm_select_default_state(dev);
 
 	cdev->can.state = CAN_STATE_ERROR_ACTIVE;
 
 	if (netif_running(ndev)) {
-		int ret;
-
 		ret = m_can_clk_start(cdev);
 		if (ret)
 			return ret;
@@ -2517,6 +2527,10 @@ int m_can_class_resume(struct device *dev)
 			 * again.
 			 */
 			cdev->active_interrupts |= IR_RF0N | IR_TEFN;
+
+			if (cdev->ops->init)
+				ret = cdev->ops->init(cdev);
+
 			m_can_write(cdev, M_CAN_IE, cdev->active_interrupts);
 		} else {
 			ret  = m_can_start(ndev);
@@ -2530,7 +2544,7 @@ int m_can_class_resume(struct device *dev)
 		netif_start_queue(ndev);
 	}
 
-	return 0;
+	return ret;
 }
 EXPORT_SYMBOL_GPL(m_can_class_resume);
 
diff --git a/drivers/net/can/m_can/m_can.h b/drivers/net/can/m_can/m_can.h
index ef39e8e527ab..bd4746c63af3 100644
--- a/drivers/net/can/m_can/m_can.h
+++ b/drivers/net/can/m_can/m_can.h
@@ -68,6 +68,7 @@ struct m_can_ops {
 	int (*write_fifo)(struct m_can_classdev *cdev, int addr_offset,
 			  const void *val, size_t val_count);
 	int (*init)(struct m_can_classdev *cdev);
+	int (*deinit)(struct m_can_classdev *cdev);
 };
 
 struct m_can_tx_op {
diff --git a/drivers/net/can/m_can/tcan4x5x-core.c b/drivers/net/can/m_can/tcan4x5x-core.c
index 2f73bf3abad8..e5c162f8c589 100644
--- a/drivers/net/can/m_can/tcan4x5x-core.c
+++ b/drivers/net/can/m_can/tcan4x5x-core.c
@@ -92,6 +92,8 @@
 #define TCAN4X5X_MODE_STANDBY BIT(6)
 #define TCAN4X5X_MODE_NORMAL BIT(7)
 
+#define TCAN4X5X_NWKRQ_VOLTAGE_VIO BIT(19)
+
 #define TCAN4X5X_DISABLE_WAKE_MSK	(BIT(31) | BIT(30))
 #define TCAN4X5X_DISABLE_INH_MSK	BIT(9)
 
@@ -267,9 +269,24 @@ static int tcan4x5x_init(struct m_can_classdev *cdev)
 	if (ret)
 		return ret;
 
+	if (tcan4x5x->nwkrq_voltage_vio) {
+		ret = regmap_set_bits(tcan4x5x->regmap, TCAN4X5X_CONFIG,
+				      TCAN4X5X_NWKRQ_VOLTAGE_VIO);
+		if (ret)
+			return ret;
+	}
+
 	return ret;
 }
 
+static int tcan4x5x_deinit(struct m_can_classdev *cdev)
+{
+	struct tcan4x5x_priv *tcan4x5x = cdev_to_priv(cdev);
+
+	return regmap_update_bits(tcan4x5x->regmap, TCAN4X5X_CONFIG,
+				  TCAN4X5X_MODE_SEL_MASK, TCAN4X5X_MODE_STANDBY);
+};
+
 static int tcan4x5x_disable_wake(struct m_can_classdev *cdev)
 {
 	struct tcan4x5x_priv *tcan4x5x = cdev_to_priv(cdev);
@@ -318,6 +335,14 @@ static const struct tcan4x5x_version_info
 	return &tcan4x5x_versions[TCAN4X5X];
 }
 
+static void tcan4x5x_get_dt_data(struct m_can_classdev *cdev)
+{
+	struct tcan4x5x_priv *tcan4x5x = cdev_to_priv(cdev);
+
+	tcan4x5x->nwkrq_voltage_vio =
+		of_property_read_bool(cdev->dev->of_node, "ti,nwkrq-voltage-vio");
+}
+
 static int tcan4x5x_get_gpios(struct m_can_classdev *cdev,
 			      const struct tcan4x5x_version_info *version_info)
 {
@@ -359,6 +384,7 @@ static int tcan4x5x_get_gpios(struct m_can_classdev *cdev,
 
 static const struct m_can_ops tcan4x5x_ops = {
 	.init = tcan4x5x_init,
+	.deinit = tcan4x5x_deinit,
 	.read_reg = tcan4x5x_read_reg,
 	.write_reg = tcan4x5x_write_reg,
 	.write_fifo = tcan4x5x_write_fifo,
@@ -392,7 +418,7 @@ static int tcan4x5x_can_probe(struct spi_device *spi)
 		priv->power = NULL;
 	}
 
-	m_can_class_get_clocks(mcan_class);
+	mcan_class->cclk = devm_clk_get(mcan_class->dev, "cclk");
 	if (IS_ERR(mcan_class->cclk)) {
 		dev_err(&spi->dev, "no CAN clock source defined\n");
 		freq = TCAN4X5X_EXT_CLK_DEF;
@@ -453,6 +479,8 @@ static int tcan4x5x_can_probe(struct spi_device *spi)
 		goto out_power;
 	}
 
+	tcan4x5x_get_dt_data(mcan_class);
+
 	tcan4x5x_check_wake(priv);
 
 	ret = tcan4x5x_write_tcan_reg(mcan_class, TCAN4X5X_INT_EN, 0);
diff --git a/drivers/net/can/m_can/tcan4x5x.h b/drivers/net/can/m_can/tcan4x5x.h
index e62c030d3e1e..203399d5e8cc 100644
--- a/drivers/net/can/m_can/tcan4x5x.h
+++ b/drivers/net/can/m_can/tcan4x5x.h
@@ -42,6 +42,8 @@ struct tcan4x5x_priv {
 
 	struct tcan4x5x_map_buf map_buf_rx;
 	struct tcan4x5x_map_buf map_buf_tx;
+
+	bool nwkrq_voltage_vio;
 };
 
 static inline void
diff --git a/drivers/net/can/sun4i_can.c b/drivers/net/can/sun4i_can.c
index 4311c1f0eafd..6fcb301ef611 100644
--- a/drivers/net/can/sun4i_can.c
+++ b/drivers/net/can/sun4i_can.c
@@ -570,7 +570,7 @@ static int sun4i_can_err(struct net_device *dev, u8 isrc, u8 status)
 		else
 			state = CAN_STATE_ERROR_ACTIVE;
 	}
-	if (skb && state != CAN_STATE_BUS_OFF) {
+	if (likely(skb) && state != CAN_STATE_BUS_OFF) {
 		cf->can_id |= CAN_ERR_CNT;
 		cf->data[6] = txerr;
 		cf->data[7] = rxerr;
diff --git a/drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c b/drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c
index 7d12776ab63e..dcb0bcbe0565 100644
--- a/drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c
+++ b/drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c
@@ -818,7 +818,8 @@ static int kvaser_usb_init_one(struct kvaser_usb *dev, int channel)
 	init_completion(&priv->stop_comp);
 	init_completion(&priv->flush_comp);
 	init_completion(&priv->get_busparams_comp);
-	priv->can.ctrlmode_supported = CAN_CTRLMODE_CC_LEN8_DLC;
+	priv->can.ctrlmode_supported = CAN_CTRLMODE_CC_LEN8_DLC |
+				       CAN_CTRLMODE_BERR_REPORTING;
 
 	priv->dev = dev;
 	priv->netdev = netdev;
diff --git a/drivers/net/can/usb/kvaser_usb/kvaser_usb_hydra.c b/drivers/net/can/usb/kvaser_usb/kvaser_usb_hydra.c
index 3764b263add3..8e88b5917796 100644
--- a/drivers/net/can/usb/kvaser_usb/kvaser_usb_hydra.c
+++ b/drivers/net/can/usb/kvaser_usb/kvaser_usb_hydra.c
@@ -926,6 +926,42 @@ kvaser_usb_hydra_bus_status_to_can_state(const struct kvaser_usb_net_priv *priv,
 	}
 }
 
+static void kvaser_usb_hydra_change_state(struct kvaser_usb_net_priv *priv,
+					  const struct can_berr_counter *bec,
+					  struct can_frame *cf,
+					  enum can_state new_state)
+{
+	struct net_device *netdev = priv->netdev;
+	enum can_state old_state = priv->can.state;
+	enum can_state tx_state, rx_state;
+
+	tx_state = (bec->txerr >= bec->rxerr) ?
+				new_state : CAN_STATE_ERROR_ACTIVE;
+	rx_state = (bec->txerr <= bec->rxerr) ?
+				new_state : CAN_STATE_ERROR_ACTIVE;
+	can_change_state(netdev, cf, tx_state, rx_state);
+
+	if (new_state == CAN_STATE_BUS_OFF && old_state < CAN_STATE_BUS_OFF) {
+		if (priv->can.restart_ms == 0)
+			kvaser_usb_hydra_send_simple_cmd_async(priv, CMD_STOP_CHIP_REQ);
+
+		can_bus_off(netdev);
+	}
+
+	if (priv->can.restart_ms &&
+	    old_state >= CAN_STATE_BUS_OFF &&
+	    new_state < CAN_STATE_BUS_OFF) {
+		priv->can.can_stats.restarts++;
+		if (cf)
+			cf->can_id |= CAN_ERR_RESTARTED;
+	}
+	if (cf && new_state != CAN_STATE_BUS_OFF) {
+		cf->can_id |= CAN_ERR_CNT;
+		cf->data[6] = bec->txerr;
+		cf->data[7] = bec->rxerr;
+	}
+}
+
 static void kvaser_usb_hydra_update_state(struct kvaser_usb_net_priv *priv,
 					  u8 bus_status,
 					  const struct can_berr_counter *bec)
@@ -951,41 +987,11 @@ static void kvaser_usb_hydra_update_state(struct kvaser_usb_net_priv *priv,
 		return;
 
 	skb = alloc_can_err_skb(netdev, &cf);
-	if (skb) {
-		enum can_state tx_state, rx_state;
-
-		tx_state = (bec->txerr >= bec->rxerr) ?
-					new_state : CAN_STATE_ERROR_ACTIVE;
-		rx_state = (bec->txerr <= bec->rxerr) ?
-					new_state : CAN_STATE_ERROR_ACTIVE;
-		can_change_state(netdev, cf, tx_state, rx_state);
-	}
-
-	if (new_state == CAN_STATE_BUS_OFF && old_state < CAN_STATE_BUS_OFF) {
-		if (!priv->can.restart_ms)
-			kvaser_usb_hydra_send_simple_cmd_async
-						(priv, CMD_STOP_CHIP_REQ);
-
-		can_bus_off(netdev);
-	}
-
-	if (!skb) {
+	kvaser_usb_hydra_change_state(priv, bec, cf, new_state);
+	if (skb)
+		netif_rx(skb);
+	else
 		netdev_warn(netdev, "No memory left for err_skb\n");
-		return;
-	}
-
-	if (priv->can.restart_ms &&
-	    old_state >= CAN_STATE_BUS_OFF &&
-	    new_state < CAN_STATE_BUS_OFF)
-		priv->can.can_stats.restarts++;
-
-	if (new_state != CAN_STATE_BUS_OFF) {
-		cf->can_id |= CAN_ERR_CNT;
-		cf->data[6] = bec->txerr;
-		cf->data[7] = bec->rxerr;
-	}
-
-	netif_rx(skb);
 }
 
 static void kvaser_usb_hydra_state_event(const struct kvaser_usb *dev,
@@ -1078,9 +1084,8 @@ kvaser_usb_hydra_error_frame(struct kvaser_usb_net_priv *priv,
 {
 	struct net_device *netdev = priv->netdev;
 	struct net_device_stats *stats = &netdev->stats;
-	struct can_frame *cf;
-	struct sk_buff *skb;
-	struct skb_shared_hwtstamps *shhwtstamps;
+	struct can_frame *cf = NULL;
+	struct sk_buff *skb = NULL;
 	struct can_berr_counter bec;
 	enum can_state new_state, old_state;
 	u8 bus_status;
@@ -1096,52 +1101,26 @@ kvaser_usb_hydra_error_frame(struct kvaser_usb_net_priv *priv,
 	kvaser_usb_hydra_bus_status_to_can_state(priv, bus_status, &bec,
 						 &new_state);
 
-	skb = alloc_can_err_skb(netdev, &cf);
+	if (priv->can.ctrlmode & CAN_CTRLMODE_BERR_REPORTING)
+		skb = alloc_can_err_skb(netdev, &cf);
+	if (new_state != old_state)
+		kvaser_usb_hydra_change_state(priv, &bec, cf, new_state);
 
-	if (new_state != old_state) {
+	if (priv->can.ctrlmode & CAN_CTRLMODE_BERR_REPORTING) {
 		if (skb) {
-			enum can_state tx_state, rx_state;
-
-			tx_state = (bec.txerr >= bec.rxerr) ?
-					new_state : CAN_STATE_ERROR_ACTIVE;
-			rx_state = (bec.txerr <= bec.rxerr) ?
-					new_state : CAN_STATE_ERROR_ACTIVE;
-
-			can_change_state(netdev, cf, tx_state, rx_state);
-
-			if (priv->can.restart_ms &&
-			    old_state >= CAN_STATE_BUS_OFF &&
-			    new_state < CAN_STATE_BUS_OFF)
-				cf->can_id |= CAN_ERR_RESTARTED;
-		}
-
-		if (new_state == CAN_STATE_BUS_OFF) {
-			if (!priv->can.restart_ms)
-				kvaser_usb_hydra_send_simple_cmd_async
-						(priv, CMD_STOP_CHIP_REQ);
+			struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
 
-			can_bus_off(netdev);
+			shhwtstamps->hwtstamp = hwtstamp;
+			cf->can_id |= CAN_ERR_BUSERROR | CAN_ERR_CNT;
+			cf->data[6] = bec.txerr;
+			cf->data[7] = bec.rxerr;
+			netif_rx(skb);
+		} else {
+			stats->rx_dropped++;
+			netdev_warn(netdev, "No memory left for err_skb\n");
 		}
 	}
 
-	if (!skb) {
-		stats->rx_dropped++;
-		netdev_warn(netdev, "No memory left for err_skb\n");
-		return;
-	}
-
-	shhwtstamps = skb_hwtstamps(skb);
-	shhwtstamps->hwtstamp = hwtstamp;
-
-	cf->can_id |= CAN_ERR_BUSERROR;
-	if (new_state != CAN_STATE_BUS_OFF) {
-		cf->can_id |= CAN_ERR_CNT;
-		cf->data[6] = bec.txerr;
-		cf->data[7] = bec.rxerr;
-	}
-
-	netif_rx(skb);
-
 	priv->bec.txerr = bec.txerr;
 	priv->bec.rxerr = bec.rxerr;
 }
diff --git a/drivers/net/can/usb/kvaser_usb/kvaser_usb_leaf.c b/drivers/net/can/usb/kvaser_usb/kvaser_usb_leaf.c
index 6b9122ab1464..6a45adcc45bd 100644
--- a/drivers/net/can/usb/kvaser_usb/kvaser_usb_leaf.c
+++ b/drivers/net/can/usb/kvaser_usb/kvaser_usb_leaf.c
@@ -1120,10 +1120,8 @@ kvaser_usb_leaf_rx_error_update_can_state(struct kvaser_usb_net_priv *priv,
 static void kvaser_usb_leaf_rx_error(const struct kvaser_usb *dev,
 				     const struct kvaser_usb_err_summary *es)
 {
-	struct can_frame *cf;
-	struct can_frame tmp_cf = { .can_id = CAN_ERR_FLAG,
-				    .len = CAN_ERR_DLC };
-	struct sk_buff *skb;
+	struct can_frame *cf = NULL;
+	struct sk_buff *skb = NULL;
 	struct net_device_stats *stats;
 	struct kvaser_usb_net_priv *priv;
 	struct kvaser_usb_net_leaf_priv *leaf;
@@ -1143,18 +1141,10 @@ static void kvaser_usb_leaf_rx_error(const struct kvaser_usb *dev,
 	if (!netif_running(priv->netdev))
 		return;
 
-	/* Update all of the CAN interface's state and error counters before
-	 * trying any memory allocation that can actually fail with -ENOMEM.
-	 *
-	 * We send a temporary stack-allocated error CAN frame to
-	 * can_change_state() for the very same reason.
-	 *
-	 * TODO: Split can_change_state() responsibility between updating the
-	 * CAN interface's state and counters, and the setting up of CAN error
-	 * frame ID and data to userspace. Remove stack allocation afterwards.
-	 */
 	old_state = priv->can.state;
-	kvaser_usb_leaf_rx_error_update_can_state(priv, es, &tmp_cf);
+	if (priv->can.ctrlmode & CAN_CTRLMODE_BERR_REPORTING)
+		skb = alloc_can_err_skb(priv->netdev, &cf);
+	kvaser_usb_leaf_rx_error_update_can_state(priv, es, cf);
 	new_state = priv->can.state;
 
 	/* If there are errors, request status updates periodically as we do
@@ -1168,13 +1158,6 @@ static void kvaser_usb_leaf_rx_error(const struct kvaser_usb *dev,
 		schedule_delayed_work(&leaf->chip_state_req_work,
 				      msecs_to_jiffies(500));
 
-	skb = alloc_can_err_skb(priv->netdev, &cf);
-	if (!skb) {
-		stats->rx_dropped++;
-		return;
-	}
-	memcpy(cf, &tmp_cf, sizeof(*cf));
-
 	if (new_state != old_state) {
 		if (es->status &
 		    (M16C_STATE_BUS_OFF | M16C_STATE_BUS_RESET)) {
@@ -1187,11 +1170,20 @@ static void kvaser_usb_leaf_rx_error(const struct kvaser_usb *dev,
 		if (priv->can.restart_ms &&
 		    old_state == CAN_STATE_BUS_OFF &&
 		    new_state < CAN_STATE_BUS_OFF) {
-			cf->can_id |= CAN_ERR_RESTARTED;
+			if (cf)
+				cf->can_id |= CAN_ERR_RESTARTED;
 			netif_carrier_on(priv->netdev);
 		}
 	}
 
+	if (!skb) {
+		if (priv->can.ctrlmode & CAN_CTRLMODE_BERR_REPORTING) {
+			stats->rx_dropped++;
+			netdev_warn(priv->netdev, "No memory left for err_skb\n");
+		}
+		return;
+	}
+
 	switch (dev->driver_info->family) {
 	case KVASER_LEAF:
 		if (es->leaf.error_factor) {
diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c
index 0561b60f668f..79dc77835681 100644
--- a/drivers/net/dsa/b53/b53_common.c
+++ b/drivers/net/dsa/b53/b53_common.c
@@ -2232,12 +2232,6 @@ bool b53_support_eee(struct dsa_switch *ds, int port)
 }
 EXPORT_SYMBOL(b53_support_eee);
 
-int b53_get_mac_eee(struct dsa_switch *ds, int port, struct ethtool_keee *e)
-{
-	return 0;
-}
-EXPORT_SYMBOL(b53_get_mac_eee);
-
 int b53_set_mac_eee(struct dsa_switch *ds, int port, struct ethtool_keee *e)
 {
 	struct b53_device *dev = ds->priv;
@@ -2299,7 +2293,6 @@ static const struct dsa_switch_ops b53_switch_ops = {
 	.port_enable		= b53_enable_port,
 	.port_disable		= b53_disable_port,
 	.support_eee		= b53_support_eee,
-	.get_mac_eee		= b53_get_mac_eee,
 	.set_mac_eee		= b53_set_mac_eee,
 	.port_bridge_join	= b53_br_join,
 	.port_bridge_leave	= b53_br_leave,
diff --git a/drivers/net/dsa/b53/b53_priv.h b/drivers/net/dsa/b53/b53_priv.h
index 99e5cfc98ae8..9e9b5bc0c5d6 100644
--- a/drivers/net/dsa/b53/b53_priv.h
+++ b/drivers/net/dsa/b53/b53_priv.h
@@ -385,7 +385,6 @@ void b53_disable_port(struct dsa_switch *ds, int port);
 void b53_brcm_hdr_setup(struct dsa_switch *ds, int port);
 int b53_eee_init(struct dsa_switch *ds, int port, struct phy_device *phy);
 bool b53_support_eee(struct dsa_switch *ds, int port);
-int b53_get_mac_eee(struct dsa_switch *ds, int port, struct ethtool_keee *e);
 int b53_set_mac_eee(struct dsa_switch *ds, int port, struct ethtool_keee *e);
 
 #endif
diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c
index a53fb6191e6b..fa2bf3fa9019 100644
--- a/drivers/net/dsa/bcm_sf2.c
+++ b/drivers/net/dsa/bcm_sf2.c
@@ -1233,7 +1233,6 @@ static const struct dsa_switch_ops bcm_sf2_ops = {
 	.port_enable		= bcm_sf2_port_setup,
 	.port_disable		= bcm_sf2_port_disable,
 	.support_eee		= b53_support_eee,
-	.get_mac_eee		= b53_get_mac_eee,
 	.set_mac_eee		= b53_set_mac_eee,
 	.port_bridge_join	= b53_br_join,
 	.port_bridge_leave	= b53_br_leave,
diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c
index e3512e324572..89f0796894af 100644
--- a/drivers/net/dsa/microchip/ksz_common.c
+++ b/drivers/net/dsa/microchip/ksz_common.c
@@ -3489,20 +3489,6 @@ static bool ksz_support_eee(struct dsa_switch *ds, int port)
 	return false;
 }
 
-static int ksz_get_mac_eee(struct dsa_switch *ds, int port,
-			   struct ethtool_keee *e)
-{
-	/* There is no documented control of Tx LPI configuration. */
-	e->tx_lpi_enabled = true;
-
-	/* There is no documented control of Tx LPI timer. According to tests
-	 * Tx LPI timer seems to be set by default to minimal value.
-	 */
-	e->tx_lpi_timer = 0;
-
-	return 0;
-}
-
 static int ksz_set_mac_eee(struct dsa_switch *ds, int port,
 			   struct ethtool_keee *e)
 {
@@ -4672,7 +4658,6 @@ static const struct dsa_switch_ops ksz_switch_ops = {
 	.cls_flower_del		= ksz_cls_flower_del,
 	.port_setup_tc		= ksz_setup_tc,
 	.support_eee		= ksz_support_eee,
-	.get_mac_eee		= ksz_get_mac_eee,
 	.set_mac_eee		= ksz_set_mac_eee,
 	.port_get_default_prio	= ksz_port_get_default_prio,
 	.port_set_default_prio	= ksz_port_set_default_prio,
diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c
index 9605febd3573..d2d0f091e49e 100644
--- a/drivers/net/dsa/mt7530.c
+++ b/drivers/net/dsa/mt7530.c
@@ -3085,18 +3085,6 @@ mt753x_setup(struct dsa_switch *ds)
 	return ret;
 }
 
-static int mt753x_get_mac_eee(struct dsa_switch *ds, int port,
-			      struct ethtool_keee *e)
-{
-	struct mt7530_priv *priv = ds->priv;
-	u32 eeecr = mt7530_read(priv, MT753X_PMEEECR_P(port));
-
-	e->tx_lpi_enabled = !(eeecr & LPI_MODE_EN);
-	e->tx_lpi_timer = LPI_THRESH_GET(eeecr);
-
-	return 0;
-}
-
 static int mt753x_set_mac_eee(struct dsa_switch *ds, int port,
 			      struct ethtool_keee *e)
 {
@@ -3239,7 +3227,6 @@ const struct dsa_switch_ops mt7530_switch_ops = {
 	.port_mirror_del	= mt753x_port_mirror_del,
 	.phylink_get_caps	= mt753x_phylink_get_caps,
 	.support_eee		= dsa_supports_eee,
-	.get_mac_eee		= mt753x_get_mac_eee,
 	.set_mac_eee		= mt753x_set_mac_eee,
 	.conduit_state_change	= mt753x_conduit_state_change,
 	.port_setup_tc		= mt753x_setup_tc,
diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index 570c8642d387..35ae084af166 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -1513,13 +1513,6 @@ static void mv88e6xxx_get_regs(struct dsa_switch *ds, int port,
 	mv88e6xxx_reg_unlock(chip);
 }
 
-static int mv88e6xxx_get_mac_eee(struct dsa_switch *ds, int port,
-				 struct ethtool_keee *e)
-{
-	/* Nothing to do on the port's MAC */
-	return 0;
-}
-
 static int mv88e6xxx_set_mac_eee(struct dsa_switch *ds, int port,
 				 struct ethtool_keee *e)
 {
@@ -7100,7 +7093,6 @@ static const struct dsa_switch_ops mv88e6xxx_switch_ops = {
 	.port_max_mtu		= mv88e6xxx_get_max_mtu,
 	.port_change_mtu	= mv88e6xxx_change_mtu,
 	.support_eee		= dsa_supports_eee,
-	.get_mac_eee		= mv88e6xxx_get_mac_eee,
 	.set_mac_eee		= mv88e6xxx_set_mac_eee,
 	.get_eeprom_len		= mv88e6xxx_get_eeprom_len,
 	.get_eeprom		= mv88e6xxx_get_eeprom,
diff --git a/drivers/net/dsa/qca/qca8k-8xxx.c b/drivers/net/dsa/qca/qca8k-8xxx.c
index 90e24bc00b99..750fc76a6e11 100644
--- a/drivers/net/dsa/qca/qca8k-8xxx.c
+++ b/drivers/net/dsa/qca/qca8k-8xxx.c
@@ -1019,7 +1019,7 @@ qca8k_setup_mdio_bus(struct qca8k_priv *priv)
 
 		of_get_phy_mode(port, &mode);
 
-		if (of_property_read_bool(port, "phy-handle") &&
+		if (of_property_present(port, "phy-handle") &&
 		    mode != PHY_INTERFACE_MODE_INTERNAL)
 			external_mdio_mask |= BIT(reg);
 		else
@@ -2017,7 +2017,6 @@ static const struct dsa_switch_ops qca8k_switch_ops = {
 	.get_sset_count		= qca8k_get_sset_count,
 	.set_ageing_time	= qca8k_set_ageing_time,
 	.support_eee		= dsa_supports_eee,
-	.get_mac_eee		= qca8k_get_mac_eee,
 	.set_mac_eee		= qca8k_set_mac_eee,
 	.port_enable		= qca8k_port_enable,
 	.port_disable		= qca8k_port_disable,
diff --git a/drivers/net/dsa/qca/qca8k-common.c b/drivers/net/dsa/qca/qca8k-common.c
index 560c74c4ac3d..13005f10edb7 100644
--- a/drivers/net/dsa/qca/qca8k-common.c
+++ b/drivers/net/dsa/qca/qca8k-common.c
@@ -557,13 +557,6 @@ exit:
 	return ret;
 }
 
-int qca8k_get_mac_eee(struct dsa_switch *ds, int port,
-		      struct ethtool_keee *e)
-{
-	/* Nothing to do on the port's MAC */
-	return 0;
-}
-
 static int qca8k_port_configure_learning(struct dsa_switch *ds, int port,
 					 bool learning)
 {
diff --git a/drivers/net/dsa/qca/qca8k.h b/drivers/net/dsa/qca/qca8k.h
index 24962a395754..d046679265fa 100644
--- a/drivers/net/dsa/qca/qca8k.h
+++ b/drivers/net/dsa/qca/qca8k.h
@@ -520,7 +520,6 @@ int qca8k_get_sset_count(struct dsa_switch *ds, int port, int sset);
 
 /* Common eee function */
 int qca8k_set_mac_eee(struct dsa_switch *ds, int port, struct ethtool_keee *eee);
-int qca8k_get_mac_eee(struct dsa_switch *ds, int port, struct ethtool_keee *e);
 
 /* Common bridge function */
 void qca8k_port_stp_state_set(struct dsa_switch *ds, int port, u8 state);
diff --git a/drivers/net/ethernet/amd/pds_core/devlink.c b/drivers/net/ethernet/amd/pds_core/devlink.c
index 2681889162a2..44971e71991f 100644
--- a/drivers/net/ethernet/amd/pds_core/devlink.c
+++ b/drivers/net/ethernet/amd/pds_core/devlink.c
@@ -118,7 +118,7 @@ int pdsc_dl_info_get(struct devlink *dl, struct devlink_info_req *req,
 	if (err && err != -EIO)
 		return err;
 
-	listlen = fw_list.num_fw_slots;
+	listlen = min(fw_list.num_fw_slots, ARRAY_SIZE(fw_list.fw_names));
 	for (i = 0; i < listlen; i++) {
 		if (i < ARRAY_SIZE(fw_slotnames))
 			strscpy(buf, fw_slotnames[i], sizeof(buf));
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 46edea75e062..884d42db5554 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -2897,6 +2897,13 @@ static int bnxt_hwrm_handler(struct bnxt *bp, struct tx_cmp *txcmp)
 	return 0;
 }
 
+static bool bnxt_vnic_is_active(struct bnxt *bp)
+{
+	struct bnxt_vnic_info *vnic = &bp->vnic_info[0];
+
+	return vnic->fw_vnic_id != INVALID_HW_RING_ID && vnic->mru > 0;
+}
+
 static irqreturn_t bnxt_msix(int irq, void *dev_instance)
 {
 	struct bnxt_napi *bnapi = dev_instance;
@@ -3164,7 +3171,7 @@ static int bnxt_poll(struct napi_struct *napi, int budget)
 			break;
 		}
 	}
-	if (bp->flags & BNXT_FLAG_DIM) {
+	if ((bp->flags & BNXT_FLAG_DIM) && bnxt_vnic_is_active(bp)) {
 		struct dim_sample dim_sample = {};
 
 		dim_update_sample(cpr->event_ctr,
@@ -3295,7 +3302,7 @@ static int bnxt_poll_p5(struct napi_struct *napi, int budget)
 poll_done:
 	cpr_rx = &cpr->cp_ring_arr[0];
 	if (cpr_rx->cp_ring_type == BNXT_NQ_HDL_TYPE_RX &&
-	    (bp->flags & BNXT_FLAG_DIM)) {
+	    (bp->flags & BNXT_FLAG_DIM) && bnxt_vnic_is_active(bp)) {
 		struct dim_sample dim_sample = {};
 
 		dim_update_sample(cpr->event_ctr,
@@ -7266,6 +7273,26 @@ err_out:
 	return rc;
 }
 
+static void bnxt_cancel_dim(struct bnxt *bp)
+{
+	int i;
+
+	/* DIM work is initialized in bnxt_enable_napi().  Proceed only
+	 * if NAPI is enabled.
+	 */
+	if (!bp->bnapi || test_bit(BNXT_STATE_NAPI_DISABLED, &bp->state))
+		return;
+
+	/* Make sure NAPI sees that the VNIC is disabled */
+	synchronize_net();
+	for (i = 0; i < bp->rx_nr_rings; i++) {
+		struct bnxt_rx_ring_info *rxr = &bp->rx_ring[i];
+		struct bnxt_napi *bnapi = rxr->bnapi;
+
+		cancel_work_sync(&bnapi->cp_ring.dim.work);
+	}
+}
+
 static int hwrm_ring_free_send_msg(struct bnxt *bp,
 				   struct bnxt_ring_struct *ring,
 				   u32 ring_type, int cmpl_ring_id)
@@ -7366,6 +7393,7 @@ static void bnxt_hwrm_ring_free(struct bnxt *bp, bool close_path)
 		}
 	}
 
+	bnxt_cancel_dim(bp);
 	for (i = 0; i < bp->rx_nr_rings; i++) {
 		bnxt_hwrm_rx_ring_free(bp, &bp->rx_ring[i], close_path);
 		bnxt_hwrm_rx_agg_ring_free(bp, &bp->rx_ring[i], close_path);
@@ -11330,8 +11358,6 @@ static void bnxt_disable_napi(struct bnxt *bp)
 		if (bnapi->in_reset)
 			cpr->sw_stats->rx.rx_resets++;
 		napi_disable(&bnapi->napi);
-		if (bnapi->rx_ring)
-			cancel_work_sync(&cpr->dim.work);
 	}
 }
 
@@ -15613,8 +15639,10 @@ static int bnxt_queue_stop(struct net_device *dev, void *qmem, int idx)
 		bnxt_hwrm_vnic_update(bp, vnic,
 				      VNIC_UPDATE_REQ_ENABLES_MRU_VALID);
 	}
-
+	/* Make sure NAPI sees that the VNIC is disabled */
+	synchronize_net();
 	rxr = &bp->rx_ring[idx];
+	cancel_work_sync(&rxr->bnapi->cp_ring.dim.work);
 	bnxt_hwrm_rx_ring_free(bp, rxr, false);
 	bnxt_hwrm_rx_agg_ring_free(bp, rxr, false);
 	rxr->rx_next_cons = 0;
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c
index 94c6a0928ca0..7c88b9f05c4c 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c
@@ -208,7 +208,7 @@ int bnxt_send_msg(struct bnxt_en_dev *edev,
 
 	rc = hwrm_req_replace(bp, req, fw_msg->msg, fw_msg->msg_len);
 	if (rc)
-		return rc;
+		goto drop_req;
 
 	hwrm_req_timeout(bp, req, fw_msg->timeout);
 	resp = hwrm_req_hold(bp, req);
@@ -220,6 +220,7 @@ int bnxt_send_msg(struct bnxt_en_dev *edev,
 
 		memcpy(fw_msg->resp, resp, resp_len);
 	}
+drop_req:
 	hwrm_req_drop(bp, req);
 	return rc;
 }
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index bc3af0054406..2f0b3e389e62 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -1799,7 +1799,10 @@ void cxgb4_remove_tid(struct tid_info *t, unsigned int chan, unsigned int tid,
 	struct adapter *adap = container_of(t, struct adapter, tids);
 	struct sk_buff *skb;
 
-	WARN_ON(tid_out_of_range(&adap->tids, tid));
+	if (tid_out_of_range(&adap->tids, tid)) {
+		dev_err(adap->pdev_dev, "tid %d out of range\n", tid);
+		return;
+	}
 
 	if (t->tid_tab[tid - adap->tids.tid_base]) {
 		t->tid_tab[tid - adap->tids.tid_base] = NULL;
@@ -6559,6 +6562,9 @@ static void cxgb4_advance_esn_state(struct xfrm_state *x)
 {
 	struct adapter *adap = netdev2adap(x->xso.dev);
 
+	if (x->xso.dir != XFRM_DEV_OFFLOAD_IN)
+		return;
+
 	if (!mutex_trylock(&uld_mutex)) {
 		dev_dbg(adap->pdev_dev,
 			"crypto uld critical resource is under use\n");
diff --git a/drivers/net/ethernet/cisco/enic/enic_main.c b/drivers/net/ethernet/cisco/enic/enic_main.c
index 9913952ccb42..49f6cab01ed5 100644
--- a/drivers/net/ethernet/cisco/enic/enic_main.c
+++ b/drivers/net/ethernet/cisco/enic/enic_main.c
@@ -109,7 +109,7 @@ static struct enic_intr_mod_table mod_table[ENIC_MAX_COALESCE_TIMERS + 1] = {
 static struct enic_intr_mod_range mod_range[ENIC_MAX_LINK_SPEEDS] = {
 	{0,  0}, /* 0  - 4  Gbps */
 	{0,  3}, /* 4  - 10 Gbps */
-	{3,  6}, /* 10 - 40 Gbps */
+	{3,  6}, /* 10+ Gbps */
 };
 
 static void enic_init_affinity_hint(struct enic *enic)
@@ -428,6 +428,36 @@ static void enic_mtu_check(struct enic *enic)
 	}
 }
 
+static void enic_set_rx_coal_setting(struct enic *enic)
+{
+	unsigned int speed;
+	int index = -1;
+	struct enic_rx_coal *rx_coal = &enic->rx_coalesce_setting;
+
+	/* 1. Read the link speed from fw
+	 * 2. Pick the default range for the speed
+	 * 3. Update it in enic->rx_coalesce_setting
+	 */
+	speed = vnic_dev_port_speed(enic->vdev);
+	if (speed > ENIC_LINK_SPEED_10G)
+		index = ENIC_LINK_40G_INDEX;
+	else if (speed > ENIC_LINK_SPEED_4G)
+		index = ENIC_LINK_10G_INDEX;
+	else
+		index = ENIC_LINK_4G_INDEX;
+
+	rx_coal->small_pkt_range_start = mod_range[index].small_pkt_range_start;
+	rx_coal->large_pkt_range_start = mod_range[index].large_pkt_range_start;
+	rx_coal->range_end = ENIC_RX_COALESCE_RANGE_END;
+
+	/* Start with the value provided by UCSM */
+	for (index = 0; index < enic->rq_count; index++)
+		enic->cq[index].cur_rx_coal_timeval =
+				enic->config.intr_timer_usec;
+
+	rx_coal->use_adaptive_rx_coalesce = 1;
+}
+
 static void enic_link_check(struct enic *enic)
 {
 	int link_status = vnic_dev_link_status(enic->vdev);
@@ -436,6 +466,7 @@ static void enic_link_check(struct enic *enic)
 	if (link_status && !carrier_ok) {
 		netdev_info(enic->netdev, "Link UP\n");
 		netif_carrier_on(enic->netdev);
+		enic_set_rx_coal_setting(enic);
 	} else if (!link_status && carrier_ok) {
 		netdev_info(enic->netdev, "Link DOWN\n");
 		netif_carrier_off(enic->netdev);
@@ -1901,36 +1932,6 @@ static void enic_synchronize_irqs(struct enic *enic)
 	}
 }
 
-static void enic_set_rx_coal_setting(struct enic *enic)
-{
-	unsigned int speed;
-	int index = -1;
-	struct enic_rx_coal *rx_coal = &enic->rx_coalesce_setting;
-
-	/* 1. Read the link speed from fw
-	 * 2. Pick the default range for the speed
-	 * 3. Update it in enic->rx_coalesce_setting
-	 */
-	speed = vnic_dev_port_speed(enic->vdev);
-	if (ENIC_LINK_SPEED_10G < speed)
-		index = ENIC_LINK_40G_INDEX;
-	else if (ENIC_LINK_SPEED_4G < speed)
-		index = ENIC_LINK_10G_INDEX;
-	else
-		index = ENIC_LINK_4G_INDEX;
-
-	rx_coal->small_pkt_range_start = mod_range[index].small_pkt_range_start;
-	rx_coal->large_pkt_range_start = mod_range[index].large_pkt_range_start;
-	rx_coal->range_end = ENIC_RX_COALESCE_RANGE_END;
-
-	/* Start with the value provided by UCSM */
-	for (index = 0; index < enic->rq_count; index++)
-		enic->cq[index].cur_rx_coal_timeval =
-				enic->config.intr_timer_usec;
-
-	rx_coal->use_adaptive_rx_coalesce = 1;
-}
-
 static int enic_dev_notify_set(struct enic *enic)
 {
 	int err;
@@ -3063,7 +3064,6 @@ static int enic_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	timer_setup(&enic->notify_timer, enic_notify_timer, 0);
 
 	enic_rfs_flw_tbl_init(enic);
-	enic_set_rx_coal_setting(enic);
 	INIT_WORK(&enic->reset, enic_reset);
 	INIT_WORK(&enic->tx_hang_reset, enic_tx_hang_reset);
 	INIT_WORK(&enic->change_mtu_work, enic_change_mtu_work);
diff --git a/drivers/net/ethernet/freescale/ucc_geth.c b/drivers/net/ethernet/freescale/ucc_geth.c
index f47f8177a93b..88510f822759 100644
--- a/drivers/net/ethernet/freescale/ucc_geth.c
+++ b/drivers/net/ethernet/freescale/ucc_geth.c
@@ -1434,7 +1434,7 @@ static void ugeth_activate(struct ucc_geth_private *ugeth)
 
 	/* allow to xmit again  */
 	netif_tx_wake_all_queues(ugeth->ndev);
-	__netdev_watchdog_up(ugeth->ndev);
+	netdev_watchdog_up(ugeth->ndev);
 }
 
 /* Initialize TBI PHY interface for communicating with the
diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c
index 8a8f6ab12a98..533e659b15b3 100644
--- a/drivers/net/ethernet/google/gve/gve_main.c
+++ b/drivers/net/ethernet/google/gve/gve_main.c
@@ -2241,14 +2241,18 @@ static void gve_service_task(struct work_struct *work)
 
 static void gve_set_netdev_xdp_features(struct gve_priv *priv)
 {
+	xdp_features_t xdp_features;
+
 	if (priv->queue_format == GVE_GQI_QPL_FORMAT) {
-		priv->dev->xdp_features = NETDEV_XDP_ACT_BASIC;
-		priv->dev->xdp_features |= NETDEV_XDP_ACT_REDIRECT;
-		priv->dev->xdp_features |= NETDEV_XDP_ACT_NDO_XMIT;
-		priv->dev->xdp_features |= NETDEV_XDP_ACT_XSK_ZEROCOPY;
+		xdp_features = NETDEV_XDP_ACT_BASIC;
+		xdp_features |= NETDEV_XDP_ACT_REDIRECT;
+		xdp_features |= NETDEV_XDP_ACT_NDO_XMIT;
+		xdp_features |= NETDEV_XDP_ACT_XSK_ZEROCOPY;
 	} else {
-		priv->dev->xdp_features = 0;
+		xdp_features = 0;
 	}
+
+	xdp_set_features_flag(priv->dev, xdp_features);
 }
 
 static int gve_init_priv(struct gve_priv *priv, bool skip_describe_device)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h
index 710a8f9f2248..12ba380eb701 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h
@@ -916,9 +916,6 @@ struct hnae3_handle {
 
 	u8 netdev_flags;
 	struct dentry *hnae3_dbgfs;
-	/* protects concurrent contention between debugfs commands */
-	struct mutex dbgfs_lock;
-	char **dbgfs_buf;
 
 	/* Network interface message level enabled bits */
 	u32 msg_enable;
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c b/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c
index 807eb3bbb11c..9bbece25552b 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c
@@ -1260,69 +1260,55 @@ static int hns3_dbg_read_cmd(struct hns3_dbg_data *dbg_data,
 static ssize_t hns3_dbg_read(struct file *filp, char __user *buffer,
 			     size_t count, loff_t *ppos)
 {
-	struct hns3_dbg_data *dbg_data = filp->private_data;
+	char *buf = filp->private_data;
+
+	return simple_read_from_buffer(buffer, count, ppos, buf, strlen(buf));
+}
+
+static int hns3_dbg_open(struct inode *inode, struct file *filp)
+{
+	struct hns3_dbg_data *dbg_data = inode->i_private;
 	struct hnae3_handle *handle = dbg_data->handle;
 	struct hns3_nic_priv *priv = handle->priv;
-	ssize_t size = 0;
-	char **save_buf;
-	char *read_buf;
 	u32 index;
+	char *buf;
 	int ret;
 
+	if (!test_bit(HNS3_NIC_STATE_INITED, &priv->state) ||
+	    test_bit(HNS3_NIC_STATE_RESETTING, &priv->state))
+		return -EBUSY;
+
 	ret = hns3_dbg_get_cmd_index(dbg_data, &index);
 	if (ret)
 		return ret;
 
-	mutex_lock(&handle->dbgfs_lock);
-	save_buf = &handle->dbgfs_buf[index];
-
-	if (!test_bit(HNS3_NIC_STATE_INITED, &priv->state) ||
-	    test_bit(HNS3_NIC_STATE_RESETTING, &priv->state)) {
-		ret = -EBUSY;
-		goto out;
-	}
-
-	if (*save_buf) {
-		read_buf = *save_buf;
-	} else {
-		read_buf = kvzalloc(hns3_dbg_cmd[index].buf_len, GFP_KERNEL);
-		if (!read_buf) {
-			ret = -ENOMEM;
-			goto out;
-		}
-
-		/* save the buffer addr until the last read operation */
-		*save_buf = read_buf;
-
-		/* get data ready for the first time to read */
-		ret = hns3_dbg_read_cmd(dbg_data, hns3_dbg_cmd[index].cmd,
-					read_buf, hns3_dbg_cmd[index].buf_len);
-		if (ret)
-			goto out;
-	}
+	buf = kvzalloc(hns3_dbg_cmd[index].buf_len, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
 
-	size = simple_read_from_buffer(buffer, count, ppos, read_buf,
-				       strlen(read_buf));
-	if (size > 0) {
-		mutex_unlock(&handle->dbgfs_lock);
-		return size;
+	ret = hns3_dbg_read_cmd(dbg_data, hns3_dbg_cmd[index].cmd,
+				buf, hns3_dbg_cmd[index].buf_len);
+	if (ret) {
+		kvfree(buf);
+		return ret;
 	}
 
-out:
-	/* free the buffer for the last read operation */
-	if (*save_buf) {
-		kvfree(*save_buf);
-		*save_buf = NULL;
-	}
+	filp->private_data = buf;
+	return 0;
+}
 
-	mutex_unlock(&handle->dbgfs_lock);
-	return ret;
+static int hns3_dbg_release(struct inode *inode, struct file *filp)
+{
+	kvfree(filp->private_data);
+	filp->private_data = NULL;
+	return 0;
 }
 
 static const struct file_operations hns3_dbg_fops = {
 	.owner = THIS_MODULE,
-	.open  = simple_open,
+	.open  = hns3_dbg_open,
 	.read  = hns3_dbg_read,
+	.release = hns3_dbg_release,
 };
 
 static int hns3_dbg_bd_file_init(struct hnae3_handle *handle, u32 cmd)
@@ -1379,13 +1365,6 @@ int hns3_dbg_init(struct hnae3_handle *handle)
 	int ret;
 	u32 i;
 
-	handle->dbgfs_buf = devm_kcalloc(&handle->pdev->dev,
-					 ARRAY_SIZE(hns3_dbg_cmd),
-					 sizeof(*handle->dbgfs_buf),
-					 GFP_KERNEL);
-	if (!handle->dbgfs_buf)
-		return -ENOMEM;
-
 	hns3_dbg_dentry[HNS3_DBG_DENTRY_COMMON].dentry =
 				debugfs_create_dir(name, hns3_dbgfs_root);
 	handle->hnae3_dbgfs = hns3_dbg_dentry[HNS3_DBG_DENTRY_COMMON].dentry;
@@ -1395,8 +1374,6 @@ int hns3_dbg_init(struct hnae3_handle *handle)
 			debugfs_create_dir(hns3_dbg_dentry[i].name,
 					   handle->hnae3_dbgfs);
 
-	mutex_init(&handle->dbgfs_lock);
-
 	for (i = 0; i < ARRAY_SIZE(hns3_dbg_cmd); i++) {
 		if ((hns3_dbg_cmd[i].cmd == HNAE3_DBG_CMD_TM_NODES &&
 		     ae_dev->dev_version <= HNAE3_DEVICE_VERSION_V2) ||
@@ -1425,24 +1402,13 @@ int hns3_dbg_init(struct hnae3_handle *handle)
 out:
 	debugfs_remove_recursive(handle->hnae3_dbgfs);
 	handle->hnae3_dbgfs = NULL;
-	mutex_destroy(&handle->dbgfs_lock);
 	return ret;
 }
 
 void hns3_dbg_uninit(struct hnae3_handle *handle)
 {
-	u32 i;
-
 	debugfs_remove_recursive(handle->hnae3_dbgfs);
 	handle->hnae3_dbgfs = NULL;
-
-	for (i = 0; i < ARRAY_SIZE(hns3_dbg_cmd); i++)
-		if (handle->dbgfs_buf[i]) {
-			kvfree(handle->dbgfs_buf[i]);
-			handle->dbgfs_buf[i] = NULL;
-		}
-
-	mutex_destroy(&handle->dbgfs_lock);
 }
 
 void hns3_dbg_register_debugfs(const char *debugfs_dir_name)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
index 43377a7b2426..a7e3b22f641c 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
@@ -2452,7 +2452,6 @@ static int hns3_nic_set_features(struct net_device *netdev,
 			return ret;
 	}
 
-	netdev->features = features;
 	return 0;
 }
 
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index 05942fa78b11..db7845009252 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -6,6 +6,7 @@
 #include <linux/etherdevice.h>
 #include <linux/init.h>
 #include <linux/interrupt.h>
+#include <linux/irq.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/netdevice.h>
@@ -3574,6 +3575,17 @@ static int hclge_set_vf_link_state(struct hnae3_handle *handle, int vf,
 	return ret;
 }
 
+static void hclge_set_reset_pending(struct hclge_dev *hdev,
+				    enum hnae3_reset_type reset_type)
+{
+	/* When an incorrect reset type is executed, the get_reset_level
+	 * function generates the HNAE3_NONE_RESET flag. As a result, this
+	 * type do not need to pending.
+	 */
+	if (reset_type != HNAE3_NONE_RESET)
+		set_bit(reset_type, &hdev->reset_pending);
+}
+
 static u32 hclge_check_event_cause(struct hclge_dev *hdev, u32 *clearval)
 {
 	u32 cmdq_src_reg, msix_src_reg, hw_err_src_reg;
@@ -3594,7 +3606,7 @@ static u32 hclge_check_event_cause(struct hclge_dev *hdev, u32 *clearval)
 	 */
 	if (BIT(HCLGE_VECTOR0_IMPRESET_INT_B) & msix_src_reg) {
 		dev_info(&hdev->pdev->dev, "IMP reset interrupt\n");
-		set_bit(HNAE3_IMP_RESET, &hdev->reset_pending);
+		hclge_set_reset_pending(hdev, HNAE3_IMP_RESET);
 		set_bit(HCLGE_COMM_STATE_CMD_DISABLE, &hdev->hw.hw.comm_state);
 		*clearval = BIT(HCLGE_VECTOR0_IMPRESET_INT_B);
 		hdev->rst_stats.imp_rst_cnt++;
@@ -3604,7 +3616,7 @@ static u32 hclge_check_event_cause(struct hclge_dev *hdev, u32 *clearval)
 	if (BIT(HCLGE_VECTOR0_GLOBALRESET_INT_B) & msix_src_reg) {
 		dev_info(&hdev->pdev->dev, "global reset interrupt\n");
 		set_bit(HCLGE_COMM_STATE_CMD_DISABLE, &hdev->hw.hw.comm_state);
-		set_bit(HNAE3_GLOBAL_RESET, &hdev->reset_pending);
+		hclge_set_reset_pending(hdev, HNAE3_GLOBAL_RESET);
 		*clearval = BIT(HCLGE_VECTOR0_GLOBALRESET_INT_B);
 		hdev->rst_stats.global_rst_cnt++;
 		return HCLGE_VECTOR0_EVENT_RST;
@@ -3759,7 +3771,7 @@ static int hclge_misc_irq_init(struct hclge_dev *hdev)
 	snprintf(hdev->misc_vector.name, HNAE3_INT_NAME_LEN, "%s-misc-%s",
 		 HCLGE_NAME, pci_name(hdev->pdev));
 	ret = request_irq(hdev->misc_vector.vector_irq, hclge_misc_irq_handle,
-			  0, hdev->misc_vector.name, hdev);
+			  IRQF_NO_AUTOEN, hdev->misc_vector.name, hdev);
 	if (ret) {
 		hclge_free_vector(hdev, 0);
 		dev_err(&hdev->pdev->dev, "request misc irq(%d) fail\n",
@@ -4052,7 +4064,7 @@ static void hclge_do_reset(struct hclge_dev *hdev)
 	case HNAE3_FUNC_RESET:
 		dev_info(&pdev->dev, "PF reset requested\n");
 		/* schedule again to check later */
-		set_bit(HNAE3_FUNC_RESET, &hdev->reset_pending);
+		hclge_set_reset_pending(hdev, HNAE3_FUNC_RESET);
 		hclge_reset_task_schedule(hdev);
 		break;
 	default:
@@ -4086,6 +4098,8 @@ static enum hnae3_reset_type hclge_get_reset_level(struct hnae3_ae_dev *ae_dev,
 		clear_bit(HNAE3_FLR_RESET, addr);
 	}
 
+	clear_bit(HNAE3_NONE_RESET, addr);
+
 	if (hdev->reset_type != HNAE3_NONE_RESET &&
 	    rst_level < hdev->reset_type)
 		return HNAE3_NONE_RESET;
@@ -4227,7 +4241,7 @@ static bool hclge_reset_err_handle(struct hclge_dev *hdev)
 		return false;
 	} else if (hdev->rst_stats.reset_fail_cnt < MAX_RESET_FAIL_CNT) {
 		hdev->rst_stats.reset_fail_cnt++;
-		set_bit(hdev->reset_type, &hdev->reset_pending);
+		hclge_set_reset_pending(hdev, hdev->reset_type);
 		dev_info(&hdev->pdev->dev,
 			 "re-schedule reset task(%u)\n",
 			 hdev->rst_stats.reset_fail_cnt);
@@ -4470,8 +4484,20 @@ static void hclge_reset_event(struct pci_dev *pdev, struct hnae3_handle *handle)
 static void hclge_set_def_reset_request(struct hnae3_ae_dev *ae_dev,
 					enum hnae3_reset_type rst_type)
 {
+#define HCLGE_SUPPORT_RESET_TYPE \
+	(BIT(HNAE3_FLR_RESET) | BIT(HNAE3_FUNC_RESET) | \
+	BIT(HNAE3_GLOBAL_RESET) | BIT(HNAE3_IMP_RESET))
+
 	struct hclge_dev *hdev = ae_dev->priv;
 
+	if (!(BIT(rst_type) & HCLGE_SUPPORT_RESET_TYPE)) {
+		/* To prevent reset triggered by hclge_reset_event */
+		set_bit(HNAE3_NONE_RESET, &hdev->default_reset_request);
+		dev_warn(&hdev->pdev->dev, "unsupported reset type %d\n",
+			 rst_type);
+		return;
+	}
+
 	set_bit(rst_type, &hdev->default_reset_request);
 }
 
@@ -11881,9 +11907,6 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev)
 
 	hclge_init_rxd_adv_layout(hdev);
 
-	/* Enable MISC vector(vector0) */
-	hclge_enable_vector(&hdev->misc_vector, true);
-
 	ret = hclge_init_wol(hdev);
 	if (ret)
 		dev_warn(&pdev->dev,
@@ -11896,6 +11919,10 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev)
 	hclge_state_init(hdev);
 	hdev->last_reset_time = jiffies;
 
+	/* Enable MISC vector(vector0) */
+	enable_irq(hdev->misc_vector.vector_irq);
+	hclge_enable_vector(&hdev->misc_vector, true);
+
 	dev_info(&hdev->pdev->dev, "%s driver initialization finished.\n",
 		 HCLGE_DRIVER_NAME);
 
@@ -12301,7 +12328,7 @@ static void hclge_uninit_ae_dev(struct hnae3_ae_dev *ae_dev)
 
 	/* Disable MISC vector(vector0) */
 	hclge_enable_vector(&hdev->misc_vector, false);
-	synchronize_irq(hdev->misc_vector.vector_irq);
+	disable_irq(hdev->misc_vector.vector_irq);
 
 	/* Disable all hw interrupts */
 	hclge_config_mac_tnl_int(hdev, false);
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_ptp.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_ptp.c
index 5505caea88e9..bab16c2191b2 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_ptp.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_ptp.c
@@ -58,6 +58,9 @@ bool hclge_ptp_set_tx_info(struct hnae3_handle *handle, struct sk_buff *skb)
 	struct hclge_dev *hdev = vport->back;
 	struct hclge_ptp *ptp = hdev->ptp;
 
+	if (!ptp)
+		return false;
+
 	if (!test_bit(HCLGE_PTP_FLAG_TX_EN, &ptp->flags) ||
 	    test_and_set_bit(HCLGE_STATE_PTP_TX_HANDLING, &hdev->state)) {
 		ptp->tx_skipped++;
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_regs.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_regs.c
index 43c1c18fa81f..8c057192aae6 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_regs.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_regs.c
@@ -510,9 +510,9 @@ out:
 static int hclge_fetch_pf_reg(struct hclge_dev *hdev, void *data,
 			      struct hnae3_knic_private_info *kinfo)
 {
-#define HCLGE_RING_REG_OFFSET		0x200
 #define HCLGE_RING_INT_REG_OFFSET	0x4
 
+	struct hnae3_queue *tqp;
 	int i, j, reg_num;
 	int data_num_sum;
 	u32 *reg = data;
@@ -533,10 +533,11 @@ static int hclge_fetch_pf_reg(struct hclge_dev *hdev, void *data,
 	reg_num = ARRAY_SIZE(ring_reg_addr_list);
 	for (j = 0; j < kinfo->num_tqps; j++) {
 		reg += hclge_reg_get_tlv(HCLGE_REG_TAG_RING, reg_num, reg);
+		tqp = kinfo->tqp[j];
 		for (i = 0; i < reg_num; i++)
-			*reg++ = hclge_read_dev(&hdev->hw,
-						ring_reg_addr_list[i] +
-						HCLGE_RING_REG_OFFSET * j);
+			*reg++ = readl_relaxed(tqp->io_base -
+					       HCLGE_TQP_REG_OFFSET +
+					       ring_reg_addr_list[i]);
 	}
 	data_num_sum += (reg_num + HCLGE_REG_TLV_SPACE) * kinfo->num_tqps;
 
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c
index 2f6ffb88e700..163c6e59ea4c 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c
@@ -1393,6 +1393,17 @@ static int hclgevf_notify_roce_client(struct hclgevf_dev *hdev,
 	return ret;
 }
 
+static void hclgevf_set_reset_pending(struct hclgevf_dev *hdev,
+				      enum hnae3_reset_type reset_type)
+{
+	/* When an incorrect reset type is executed, the get_reset_level
+	 * function generates the HNAE3_NONE_RESET flag. As a result, this
+	 * type do not need to pending.
+	 */
+	if (reset_type != HNAE3_NONE_RESET)
+		set_bit(reset_type, &hdev->reset_pending);
+}
+
 static int hclgevf_reset_wait(struct hclgevf_dev *hdev)
 {
 #define HCLGEVF_RESET_WAIT_US	20000
@@ -1542,7 +1553,7 @@ static void hclgevf_reset_err_handle(struct hclgevf_dev *hdev)
 		hdev->rst_stats.rst_fail_cnt);
 
 	if (hdev->rst_stats.rst_fail_cnt < HCLGEVF_RESET_MAX_FAIL_CNT)
-		set_bit(hdev->reset_type, &hdev->reset_pending);
+		hclgevf_set_reset_pending(hdev, hdev->reset_type);
 
 	if (hclgevf_is_reset_pending(hdev)) {
 		set_bit(HCLGEVF_RESET_PENDING, &hdev->reset_state);
@@ -1662,6 +1673,8 @@ static enum hnae3_reset_type hclgevf_get_reset_level(unsigned long *addr)
 		clear_bit(HNAE3_FLR_RESET, addr);
 	}
 
+	clear_bit(HNAE3_NONE_RESET, addr);
+
 	return rst_level;
 }
 
@@ -1671,14 +1684,15 @@ static void hclgevf_reset_event(struct pci_dev *pdev,
 	struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev);
 	struct hclgevf_dev *hdev = ae_dev->priv;
 
-	dev_info(&hdev->pdev->dev, "received reset request from VF enet\n");
-
 	if (hdev->default_reset_request)
 		hdev->reset_level =
 			hclgevf_get_reset_level(&hdev->default_reset_request);
 	else
 		hdev->reset_level = HNAE3_VF_FUNC_RESET;
 
+	dev_info(&hdev->pdev->dev, "received reset request from VF enet, reset level is %d\n",
+		 hdev->reset_level);
+
 	/* reset of this VF requested */
 	set_bit(HCLGEVF_RESET_REQUESTED, &hdev->reset_state);
 	hclgevf_reset_task_schedule(hdev);
@@ -1689,8 +1703,20 @@ static void hclgevf_reset_event(struct pci_dev *pdev,
 static void hclgevf_set_def_reset_request(struct hnae3_ae_dev *ae_dev,
 					  enum hnae3_reset_type rst_type)
 {
+#define HCLGEVF_SUPPORT_RESET_TYPE \
+	(BIT(HNAE3_VF_RESET) | BIT(HNAE3_VF_FUNC_RESET) | \
+	BIT(HNAE3_VF_PF_FUNC_RESET) | BIT(HNAE3_VF_FULL_RESET) | \
+	BIT(HNAE3_FLR_RESET) | BIT(HNAE3_VF_EXP_RESET))
+
 	struct hclgevf_dev *hdev = ae_dev->priv;
 
+	if (!(BIT(rst_type) & HCLGEVF_SUPPORT_RESET_TYPE)) {
+		/* To prevent reset triggered by hclge_reset_event */
+		set_bit(HNAE3_NONE_RESET, &hdev->default_reset_request);
+		dev_info(&hdev->pdev->dev, "unsupported reset type %d\n",
+			 rst_type);
+		return;
+	}
 	set_bit(rst_type, &hdev->default_reset_request);
 }
 
@@ -1847,14 +1873,14 @@ static void hclgevf_reset_service_task(struct hclgevf_dev *hdev)
 		 */
 		if (hdev->reset_attempts > HCLGEVF_MAX_RESET_ATTEMPTS_CNT) {
 			/* prepare for full reset of stack + pcie interface */
-			set_bit(HNAE3_VF_FULL_RESET, &hdev->reset_pending);
+			hclgevf_set_reset_pending(hdev, HNAE3_VF_FULL_RESET);
 
 			/* "defer" schedule the reset task again */
 			set_bit(HCLGEVF_RESET_PENDING, &hdev->reset_state);
 		} else {
 			hdev->reset_attempts++;
 
-			set_bit(hdev->reset_level, &hdev->reset_pending);
+			hclgevf_set_reset_pending(hdev, hdev->reset_level);
 			set_bit(HCLGEVF_RESET_PENDING, &hdev->reset_state);
 		}
 		hclgevf_reset_task_schedule(hdev);
@@ -1977,7 +2003,7 @@ static enum hclgevf_evt_cause hclgevf_check_evt_cause(struct hclgevf_dev *hdev,
 		rst_ing_reg = hclgevf_read_dev(&hdev->hw, HCLGEVF_RST_ING);
 		dev_info(&hdev->pdev->dev,
 			 "receive reset interrupt 0x%x!\n", rst_ing_reg);
-		set_bit(HNAE3_VF_RESET, &hdev->reset_pending);
+		hclgevf_set_reset_pending(hdev, HNAE3_VF_RESET);
 		set_bit(HCLGEVF_RESET_PENDING, &hdev->reset_state);
 		set_bit(HCLGE_COMM_STATE_CMD_DISABLE, &hdev->hw.hw.comm_state);
 		*clearval = ~(1U << HCLGEVF_VECTOR0_RST_INT_B);
@@ -2287,6 +2313,8 @@ static void hclgevf_state_init(struct hclgevf_dev *hdev)
 	clear_bit(HCLGEVF_STATE_RST_FAIL, &hdev->state);
 
 	INIT_DELAYED_WORK(&hdev->service_task, hclgevf_service_task);
+	/* timer needs to be initialized before misc irq */
+	timer_setup(&hdev->reset_timer, hclgevf_reset_timer, 0);
 
 	mutex_init(&hdev->mbx_resp.mbx_mutex);
 	sema_init(&hdev->reset_sem, 1);
@@ -2986,7 +3014,6 @@ static int hclgevf_init_hdev(struct hclgevf_dev *hdev)
 		 HCLGEVF_DRIVER_NAME);
 
 	hclgevf_task_schedule(hdev, round_jiffies_relative(HZ));
-	timer_setup(&hdev->reset_timer, hclgevf_reset_timer, 0);
 
 	return 0;
 
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_regs.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_regs.c
index 6db415d8b917..7d9d9dbc7560 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_regs.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_regs.c
@@ -123,10 +123,10 @@ int hclgevf_get_regs_len(struct hnae3_handle *handle)
 void hclgevf_get_regs(struct hnae3_handle *handle, u32 *version,
 		      void *data)
 {
-#define HCLGEVF_RING_REG_OFFSET		0x200
 #define HCLGEVF_RING_INT_REG_OFFSET	0x4
 
 	struct hclgevf_dev *hdev = hclgevf_ae_get_hdev(handle);
+	struct hnae3_queue *tqp;
 	int i, j, reg_um;
 	u32 *reg = data;
 
@@ -147,10 +147,11 @@ void hclgevf_get_regs(struct hnae3_handle *handle, u32 *version,
 	reg_um = ARRAY_SIZE(ring_reg_addr_list);
 	for (j = 0; j < hdev->num_tqps; j++) {
 		reg += hclgevf_reg_get_tlv(HCLGEVF_REG_TAG_RING, reg_um, reg);
+		tqp = &hdev->htqp[j].q;
 		for (i = 0; i < reg_um; i++)
-			*reg++ = hclgevf_read_dev(&hdev->hw,
-						  ring_reg_addr_list[i] +
-						  HCLGEVF_RING_REG_OFFSET * j);
+			*reg++ = readl_relaxed(tqp->io_base -
+					       HCLGEVF_TQP_REG_OFFSET +
+					       ring_reg_addr_list[i]);
 	}
 
 	reg_um = ARRAY_SIZE(tqp_intr_reg_addr_list);
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_pf.c b/drivers/net/ethernet/intel/fm10k/fm10k_pf.c
index 98861cc6df7c..b9dd7b719832 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_pf.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_pf.c
@@ -1180,126 +1180,6 @@ s32 fm10k_iov_select_vid(struct fm10k_vf_info *vf_info, u16 vid)
 }
 
 /**
- *  fm10k_iov_msg_mac_vlan_pf - Message handler for MAC/VLAN request from VF
- *  @hw: Pointer to hardware structure
- *  @results: Pointer array to message, results[0] is pointer to message
- *  @mbx: Pointer to mailbox information structure
- *
- *  This function is a default handler for MAC/VLAN requests from the VF.
- *  The assumption is that in this case it is acceptable to just directly
- *  hand off the message from the VF to the underlying shared code.
- **/
-s32 fm10k_iov_msg_mac_vlan_pf(struct fm10k_hw *hw, u32 **results,
-			      struct fm10k_mbx_info *mbx)
-{
-	struct fm10k_vf_info *vf_info = (struct fm10k_vf_info *)mbx;
-	u8 mac[ETH_ALEN];
-	u32 *result;
-	int err = 0;
-	bool set;
-	u16 vlan;
-	u32 vid;
-
-	/* we shouldn't be updating rules on a disabled interface */
-	if (!FM10K_VF_FLAG_ENABLED(vf_info))
-		err = FM10K_ERR_PARAM;
-
-	if (!err && !!results[FM10K_MAC_VLAN_MSG_VLAN]) {
-		result = results[FM10K_MAC_VLAN_MSG_VLAN];
-
-		/* record VLAN id requested */
-		err = fm10k_tlv_attr_get_u32(result, &vid);
-		if (err)
-			return err;
-
-		set = !(vid & FM10K_VLAN_CLEAR);
-		vid &= ~FM10K_VLAN_CLEAR;
-
-		/* if the length field has been set, this is a multi-bit
-		 * update request. For multi-bit requests, simply disallow
-		 * them when the pf_vid has been set. In this case, the PF
-		 * should have already cleared the VLAN_TABLE, and if we
-		 * allowed them, it could allow a rogue VF to receive traffic
-		 * on a VLAN it was not assigned. In the single-bit case, we
-		 * need to modify requests for VLAN 0 to use the default PF or
-		 * SW vid when assigned.
-		 */
-
-		if (vid >> 16) {
-			/* prevent multi-bit requests when PF has
-			 * administratively set the VLAN for this VF
-			 */
-			if (vf_info->pf_vid)
-				return FM10K_ERR_PARAM;
-		} else {
-			err = fm10k_iov_select_vid(vf_info, (u16)vid);
-			if (err < 0)
-				return err;
-
-			vid = err;
-		}
-
-		/* update VSI info for VF in regards to VLAN table */
-		err = hw->mac.ops.update_vlan(hw, vid, vf_info->vsi, set);
-	}
-
-	if (!err && !!results[FM10K_MAC_VLAN_MSG_MAC]) {
-		result = results[FM10K_MAC_VLAN_MSG_MAC];
-
-		/* record unicast MAC address requested */
-		err = fm10k_tlv_attr_get_mac_vlan(result, mac, &vlan);
-		if (err)
-			return err;
-
-		/* block attempts to set MAC for a locked device */
-		if (is_valid_ether_addr(vf_info->mac) &&
-		    !ether_addr_equal(mac, vf_info->mac))
-			return FM10K_ERR_PARAM;
-
-		set = !(vlan & FM10K_VLAN_CLEAR);
-		vlan &= ~FM10K_VLAN_CLEAR;
-
-		err = fm10k_iov_select_vid(vf_info, vlan);
-		if (err < 0)
-			return err;
-
-		vlan = (u16)err;
-
-		/* notify switch of request for new unicast address */
-		err = hw->mac.ops.update_uc_addr(hw, vf_info->glort,
-						 mac, vlan, set, 0);
-	}
-
-	if (!err && !!results[FM10K_MAC_VLAN_MSG_MULTICAST]) {
-		result = results[FM10K_MAC_VLAN_MSG_MULTICAST];
-
-		/* record multicast MAC address requested */
-		err = fm10k_tlv_attr_get_mac_vlan(result, mac, &vlan);
-		if (err)
-			return err;
-
-		/* verify that the VF is allowed to request multicast */
-		if (!(vf_info->vf_flags & FM10K_VF_FLAG_MULTI_ENABLED))
-			return FM10K_ERR_PARAM;
-
-		set = !(vlan & FM10K_VLAN_CLEAR);
-		vlan &= ~FM10K_VLAN_CLEAR;
-
-		err = fm10k_iov_select_vid(vf_info, vlan);
-		if (err < 0)
-			return err;
-
-		vlan = (u16)err;
-
-		/* notify switch of request for new multicast address */
-		err = hw->mac.ops.update_mc_addr(hw, vf_info->glort,
-						 mac, vlan, set);
-	}
-
-	return err;
-}
-
-/**
  *  fm10k_iov_supported_xcast_mode_pf - Determine best match for xcast mode
  *  @vf_info: VF info structure containing capability flags
  *  @mode: Requested xcast mode
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_pf.h b/drivers/net/ethernet/intel/fm10k/fm10k_pf.h
index 8e814df709d2..ad3696893cb1 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_pf.h
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_pf.h
@@ -99,8 +99,6 @@ extern const struct fm10k_tlv_attr fm10k_err_msg_attr[];
 
 s32 fm10k_iov_select_vid(struct fm10k_vf_info *vf_info, u16 vid);
 s32 fm10k_iov_msg_msix_pf(struct fm10k_hw *, u32 **, struct fm10k_mbx_info *);
-s32 fm10k_iov_msg_mac_vlan_pf(struct fm10k_hw *, u32 **,
-			      struct fm10k_mbx_info *);
 s32 fm10k_iov_msg_lport_state_pf(struct fm10k_hw *, u32 **,
 				 struct fm10k_mbx_info *);
 
diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h
index d4255c2706fa..c67963bfe14e 100644
--- a/drivers/net/ethernet/intel/i40e/i40e.h
+++ b/drivers/net/ethernet/intel/i40e/i40e.h
@@ -88,6 +88,7 @@ enum i40e_state {
 	__I40E_SERVICE_SCHED,
 	__I40E_ADMINQ_EVENT_PENDING,
 	__I40E_MDD_EVENT_PENDING,
+	__I40E_MDD_VF_PRINT_PENDING,
 	__I40E_VFLR_EVENT_PENDING,
 	__I40E_RESET_RECOVERY_PENDING,
 	__I40E_TIMEOUT_RECOVERY_PENDING,
@@ -191,6 +192,7 @@ enum i40e_pf_flags {
 	 */
 	I40E_FLAG_TOTAL_PORT_SHUTDOWN_ENA,
 	I40E_FLAG_VF_VLAN_PRUNING_ENA,
+	I40E_FLAG_MDD_AUTO_RESET_VF,
 	I40E_PF_FLAGS_NBITS,		/* must be last */
 };
 
@@ -572,7 +574,7 @@ struct i40e_pf {
 	int num_alloc_vfs;	/* actual number of VFs allocated */
 	u32 vf_aq_requests;
 	u32 arq_overflows;	/* Not fatal, possibly indicative of problems */
-
+	struct ratelimit_state mdd_message_rate_limit;
 	/* DCBx/DCBNL capability for PF that indicates
 	 * whether DCBx is managed by firmware or host
 	 * based agent (LLDPAD). Also, indicates what
@@ -1189,7 +1191,6 @@ int i40e_add_del_fdir(struct i40e_vsi *vsi,
 		      struct i40e_fdir_filter *input, bool add);
 void i40e_fdir_check_and_reenable(struct i40e_pf *pf);
 u32 i40e_get_current_fd_count(struct i40e_pf *pf);
-u32 i40e_get_cur_guaranteed_fd_count(struct i40e_pf *pf);
 u32 i40e_get_current_atr_cnt(struct i40e_pf *pf);
 u32 i40e_get_global_fd_count(struct i40e_pf *pf);
 bool i40e_set_ntuple(struct i40e_pf *pf, netdev_features_t features);
@@ -1197,7 +1198,6 @@ void i40e_set_ethtool_ops(struct net_device *netdev);
 struct i40e_mac_filter *i40e_add_filter(struct i40e_vsi *vsi,
 					const u8 *macaddr, s16 vlan);
 void __i40e_del_filter(struct i40e_vsi *vsi, struct i40e_mac_filter *f);
-void i40e_del_filter(struct i40e_vsi *vsi, const u8 *macaddr, s16 vlan);
 int i40e_sync_vsi_filters(struct i40e_vsi *vsi);
 struct i40e_vsi *i40e_vsi_setup(struct i40e_pf *pf, u8 type,
 				u16 uplink, u32 param1);
@@ -1313,7 +1313,6 @@ int i40e_update_adq_vsi_queues(struct i40e_vsi *vsi, int vsi_offset);
 int i40e_is_vsi_uplink_mode_veb(struct i40e_vsi *vsi);
 int i40e_get_partition_bw_setting(struct i40e_pf *pf);
 int i40e_set_partition_bw_setting(struct i40e_pf *pf);
-int i40e_commit_partition_bw_setting(struct i40e_pf *pf);
 void i40e_print_link_message(struct i40e_vsi *vsi, bool isup);
 
 void i40e_set_fec_in_flags(u8 fec_cfg, unsigned long *flags);
diff --git a/drivers/net/ethernet/intel/i40e/i40e_adminq.c b/drivers/net/ethernet/intel/i40e/i40e_adminq.c
index f73f5930fc58..175c1320c143 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_adminq.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_adminq.c
@@ -1016,16 +1016,6 @@ i40e_asq_send_command_atomic_v2(struct i40e_hw *hw,
 	return status;
 }
 
-int
-i40e_asq_send_command_v2(struct i40e_hw *hw, struct i40e_aq_desc *desc,
-			 void *buff, /* can be NULL */ u16  buff_size,
-			 struct i40e_asq_cmd_details *cmd_details,
-			 enum i40e_admin_queue_err *aq_status)
-{
-	return i40e_asq_send_command_atomic_v2(hw, desc, buff, buff_size,
-					       cmd_details, true, aq_status);
-}
-
 /**
  *  i40e_fill_default_direct_cmd_desc - AQ descriptor helper function
  *  @desc:     pointer to the temp descriptor (non DMA mem)
diff --git a/drivers/net/ethernet/intel/i40e/i40e_common.c b/drivers/net/ethernet/intel/i40e/i40e_common.c
index e8031f1a9b4f..370b4bddee44 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_common.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_common.c
@@ -1805,37 +1805,6 @@ int i40e_aq_set_vsi_broadcast(struct i40e_hw *hw,
 }
 
 /**
- * i40e_aq_set_vsi_vlan_promisc - control the VLAN promiscuous setting
- * @hw: pointer to the hw struct
- * @seid: vsi number
- * @enable: set MAC L2 layer unicast promiscuous enable/disable for a given VLAN
- * @cmd_details: pointer to command details structure or NULL
- **/
-int i40e_aq_set_vsi_vlan_promisc(struct i40e_hw *hw,
-				 u16 seid, bool enable,
-				 struct i40e_asq_cmd_details *cmd_details)
-{
-	struct i40e_aq_desc desc;
-	struct i40e_aqc_set_vsi_promiscuous_modes *cmd =
-		(struct i40e_aqc_set_vsi_promiscuous_modes *)&desc.params.raw;
-	u16 flags = 0;
-	int status;
-
-	i40e_fill_default_direct_cmd_desc(&desc,
-					i40e_aqc_opc_set_vsi_promiscuous_modes);
-	if (enable)
-		flags |= I40E_AQC_SET_VSI_PROMISC_VLAN;
-
-	cmd->promiscuous_flags = cpu_to_le16(flags);
-	cmd->valid_flags = cpu_to_le16(I40E_AQC_SET_VSI_PROMISC_VLAN);
-	cmd->seid = cpu_to_le16(seid);
-
-	status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
-
-	return status;
-}
-
-/**
  * i40e_aq_get_vsi_params - get VSI configuration info
  * @hw: pointer to the hw struct
  * @vsi_ctx: pointer to a vsi context struct
@@ -2436,136 +2405,6 @@ i40e_aq_remove_macvlan_v2(struct i40e_hw *hw, u16 seid,
 }
 
 /**
- * i40e_mirrorrule_op - Internal helper function to add/delete mirror rule
- * @hw: pointer to the hw struct
- * @opcode: AQ opcode for add or delete mirror rule
- * @sw_seid: Switch SEID (to which rule refers)
- * @rule_type: Rule Type (ingress/egress/VLAN)
- * @id: Destination VSI SEID or Rule ID
- * @count: length of the list
- * @mr_list: list of mirrored VSI SEIDs or VLAN IDs
- * @cmd_details: pointer to command details structure or NULL
- * @rule_id: Rule ID returned from FW
- * @rules_used: Number of rules used in internal switch
- * @rules_free: Number of rules free in internal switch
- *
- * Add/Delete a mirror rule to a specific switch. Mirror rules are supported for
- * VEBs/VEPA elements only
- **/
-static int i40e_mirrorrule_op(struct i40e_hw *hw,
-			      u16 opcode, u16 sw_seid, u16 rule_type, u16 id,
-			      u16 count, __le16 *mr_list,
-			      struct i40e_asq_cmd_details *cmd_details,
-			      u16 *rule_id, u16 *rules_used, u16 *rules_free)
-{
-	struct i40e_aq_desc desc;
-	struct i40e_aqc_add_delete_mirror_rule *cmd =
-		(struct i40e_aqc_add_delete_mirror_rule *)&desc.params.raw;
-	struct i40e_aqc_add_delete_mirror_rule_completion *resp =
-	(struct i40e_aqc_add_delete_mirror_rule_completion *)&desc.params.raw;
-	u16 buf_size;
-	int status;
-
-	buf_size = count * sizeof(*mr_list);
-
-	/* prep the rest of the request */
-	i40e_fill_default_direct_cmd_desc(&desc, opcode);
-	cmd->seid = cpu_to_le16(sw_seid);
-	cmd->rule_type = cpu_to_le16(rule_type &
-				     I40E_AQC_MIRROR_RULE_TYPE_MASK);
-	cmd->num_entries = cpu_to_le16(count);
-	/* Dest VSI for add, rule_id for delete */
-	cmd->destination = cpu_to_le16(id);
-	if (mr_list) {
-		desc.flags |= cpu_to_le16((u16)(I40E_AQ_FLAG_BUF |
-						I40E_AQ_FLAG_RD));
-		if (buf_size > I40E_AQ_LARGE_BUF)
-			desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_LB);
-	}
-
-	status = i40e_asq_send_command(hw, &desc, mr_list, buf_size,
-				       cmd_details);
-	if (!status ||
-	    hw->aq.asq_last_status == I40E_AQ_RC_ENOSPC) {
-		if (rule_id)
-			*rule_id = le16_to_cpu(resp->rule_id);
-		if (rules_used)
-			*rules_used = le16_to_cpu(resp->mirror_rules_used);
-		if (rules_free)
-			*rules_free = le16_to_cpu(resp->mirror_rules_free);
-	}
-	return status;
-}
-
-/**
- * i40e_aq_add_mirrorrule - add a mirror rule
- * @hw: pointer to the hw struct
- * @sw_seid: Switch SEID (to which rule refers)
- * @rule_type: Rule Type (ingress/egress/VLAN)
- * @dest_vsi: SEID of VSI to which packets will be mirrored
- * @count: length of the list
- * @mr_list: list of mirrored VSI SEIDs or VLAN IDs
- * @cmd_details: pointer to command details structure or NULL
- * @rule_id: Rule ID returned from FW
- * @rules_used: Number of rules used in internal switch
- * @rules_free: Number of rules free in internal switch
- *
- * Add mirror rule. Mirror rules are supported for VEBs or VEPA elements only
- **/
-int i40e_aq_add_mirrorrule(struct i40e_hw *hw, u16 sw_seid,
-			   u16 rule_type, u16 dest_vsi, u16 count,
-			   __le16 *mr_list,
-			   struct i40e_asq_cmd_details *cmd_details,
-			   u16 *rule_id, u16 *rules_used, u16 *rules_free)
-{
-	if (!(rule_type == I40E_AQC_MIRROR_RULE_TYPE_ALL_INGRESS ||
-	    rule_type == I40E_AQC_MIRROR_RULE_TYPE_ALL_EGRESS)) {
-		if (count == 0 || !mr_list)
-			return -EINVAL;
-	}
-
-	return i40e_mirrorrule_op(hw, i40e_aqc_opc_add_mirror_rule, sw_seid,
-				  rule_type, dest_vsi, count, mr_list,
-				  cmd_details, rule_id, rules_used, rules_free);
-}
-
-/**
- * i40e_aq_delete_mirrorrule - delete a mirror rule
- * @hw: pointer to the hw struct
- * @sw_seid: Switch SEID (to which rule refers)
- * @rule_type: Rule Type (ingress/egress/VLAN)
- * @count: length of the list
- * @rule_id: Rule ID that is returned in the receive desc as part of
- *		add_mirrorrule.
- * @mr_list: list of mirrored VLAN IDs to be removed
- * @cmd_details: pointer to command details structure or NULL
- * @rules_used: Number of rules used in internal switch
- * @rules_free: Number of rules free in internal switch
- *
- * Delete a mirror rule. Mirror rules are supported for VEBs/VEPA elements only
- **/
-int i40e_aq_delete_mirrorrule(struct i40e_hw *hw, u16 sw_seid,
-			      u16 rule_type, u16 rule_id, u16 count,
-			      __le16 *mr_list,
-			      struct i40e_asq_cmd_details *cmd_details,
-			      u16 *rules_used, u16 *rules_free)
-{
-	/* Rule ID has to be valid except rule_type: INGRESS VLAN mirroring */
-	if (rule_type == I40E_AQC_MIRROR_RULE_TYPE_VLAN) {
-		/* count and mr_list shall be valid for rule_type INGRESS VLAN
-		 * mirroring. For other rule_type, count and rule_type should
-		 * not matter.
-		 */
-		if (count == 0 || !mr_list)
-			return -EINVAL;
-	}
-
-	return i40e_mirrorrule_op(hw, i40e_aqc_opc_delete_mirror_rule, sw_seid,
-				  rule_type, rule_id, count, mr_list,
-				  cmd_details, NULL, rules_used, rules_free);
-}
-
-/**
  * i40e_aq_send_msg_to_vf
  * @hw: pointer to the hardware structure
  * @vfid: VF id to send msg
@@ -3180,41 +3019,6 @@ i40e_aq_update_nvm_exit:
 }
 
 /**
- * i40e_aq_rearrange_nvm
- * @hw: pointer to the hw struct
- * @rearrange_nvm: defines direction of rearrangement
- * @cmd_details: pointer to command details structure or NULL
- *
- * Rearrange NVM structure, available only for transition FW
- **/
-int i40e_aq_rearrange_nvm(struct i40e_hw *hw,
-			  u8 rearrange_nvm,
-			  struct i40e_asq_cmd_details *cmd_details)
-{
-	struct i40e_aqc_nvm_update *cmd;
-	struct i40e_aq_desc desc;
-	int status;
-
-	cmd = (struct i40e_aqc_nvm_update *)&desc.params.raw;
-
-	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_nvm_update);
-
-	rearrange_nvm &= (I40E_AQ_NVM_REARRANGE_TO_FLAT |
-			 I40E_AQ_NVM_REARRANGE_TO_STRUCT);
-
-	if (!rearrange_nvm) {
-		status = -EINVAL;
-		goto i40e_aq_rearrange_nvm_exit;
-	}
-
-	cmd->command_flags |= rearrange_nvm;
-	status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
-
-i40e_aq_rearrange_nvm_exit:
-	return status;
-}
-
-/**
  * i40e_aq_get_lldp_mib
  * @hw: pointer to the hw struct
  * @bridge_type: type of bridge requested
@@ -3335,44 +3139,6 @@ int i40e_aq_cfg_lldp_mib_change_event(struct i40e_hw *hw,
 }
 
 /**
- * i40e_aq_restore_lldp
- * @hw: pointer to the hw struct
- * @setting: pointer to factory setting variable or NULL
- * @restore: True if factory settings should be restored
- * @cmd_details: pointer to command details structure or NULL
- *
- * Restore LLDP Agent factory settings if @restore set to True. In other case
- * only returns factory setting in AQ response.
- **/
-int
-i40e_aq_restore_lldp(struct i40e_hw *hw, u8 *setting, bool restore,
-		     struct i40e_asq_cmd_details *cmd_details)
-{
-	struct i40e_aq_desc desc;
-	struct i40e_aqc_lldp_restore *cmd =
-		(struct i40e_aqc_lldp_restore *)&desc.params.raw;
-	int status;
-
-	if (!test_bit(I40E_HW_CAP_FW_LLDP_PERSISTENT, hw->caps)) {
-		i40e_debug(hw, I40E_DEBUG_ALL,
-			   "Restore LLDP not supported by current FW version.\n");
-		return -ENODEV;
-	}
-
-	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_lldp_restore);
-
-	if (restore)
-		cmd->command |= I40E_AQ_LLDP_AGENT_RESTORE;
-
-	status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
-
-	if (setting)
-		*setting = cmd->command & 1;
-
-	return status;
-}
-
-/**
  * i40e_aq_stop_lldp
  * @hw: pointer to the hw struct
  * @shutdown_agent: True if LLDP Agent needs to be Shutdown
@@ -4570,84 +4336,6 @@ phy_write_end:
 }
 
 /**
- * i40e_write_phy_register
- * @hw: pointer to the HW structure
- * @page: registers page number
- * @reg: register address in the page
- * @phy_addr: PHY address on MDIO interface
- * @value: PHY register value
- *
- * Writes value to specified PHY register
- **/
-int i40e_write_phy_register(struct i40e_hw *hw,
-			    u8 page, u16 reg, u8 phy_addr, u16 value)
-{
-	int status;
-
-	switch (hw->device_id) {
-	case I40E_DEV_ID_1G_BASE_T_X722:
-		status = i40e_write_phy_register_clause22(hw, reg, phy_addr,
-							  value);
-		break;
-	case I40E_DEV_ID_1G_BASE_T_BC:
-	case I40E_DEV_ID_5G_BASE_T_BC:
-	case I40E_DEV_ID_10G_BASE_T:
-	case I40E_DEV_ID_10G_BASE_T4:
-	case I40E_DEV_ID_10G_BASE_T_BC:
-	case I40E_DEV_ID_10G_BASE_T_X722:
-	case I40E_DEV_ID_25G_B:
-	case I40E_DEV_ID_25G_SFP28:
-		status = i40e_write_phy_register_clause45(hw, page, reg,
-							  phy_addr, value);
-		break;
-	default:
-		status = -EIO;
-		break;
-	}
-
-	return status;
-}
-
-/**
- * i40e_read_phy_register
- * @hw: pointer to the HW structure
- * @page: registers page number
- * @reg: register address in the page
- * @phy_addr: PHY address on MDIO interface
- * @value: PHY register value
- *
- * Reads specified PHY register value
- **/
-int i40e_read_phy_register(struct i40e_hw *hw,
-			   u8 page, u16 reg, u8 phy_addr, u16 *value)
-{
-	int status;
-
-	switch (hw->device_id) {
-	case I40E_DEV_ID_1G_BASE_T_X722:
-		status = i40e_read_phy_register_clause22(hw, reg, phy_addr,
-							 value);
-		break;
-	case I40E_DEV_ID_1G_BASE_T_BC:
-	case I40E_DEV_ID_5G_BASE_T_BC:
-	case I40E_DEV_ID_10G_BASE_T:
-	case I40E_DEV_ID_10G_BASE_T4:
-	case I40E_DEV_ID_10G_BASE_T_BC:
-	case I40E_DEV_ID_10G_BASE_T_X722:
-	case I40E_DEV_ID_25G_B:
-	case I40E_DEV_ID_25G_SFP28:
-		status = i40e_read_phy_register_clause45(hw, page, reg,
-							 phy_addr, value);
-		break;
-	default:
-		status = -EIO;
-		break;
-	}
-
-	return status;
-}
-
-/**
  * i40e_get_phy_address
  * @hw: pointer to the HW structure
  * @dev_num: PHY port num that address we want
@@ -4663,80 +4351,6 @@ u8 i40e_get_phy_address(struct i40e_hw *hw, u8 dev_num)
 }
 
 /**
- * i40e_blink_phy_link_led
- * @hw: pointer to the HW structure
- * @time: time how long led will blinks in secs
- * @interval: gap between LED on and off in msecs
- *
- * Blinks PHY link LED
- **/
-int i40e_blink_phy_link_led(struct i40e_hw *hw,
-			    u32 time, u32 interval)
-{
-	u16 led_addr = I40E_PHY_LED_PROV_REG_1;
-	u16 gpio_led_port;
-	u8 phy_addr = 0;
-	int status = 0;
-	u16 led_ctl;
-	u8 port_num;
-	u16 led_reg;
-	u32 i;
-
-	i = rd32(hw, I40E_PFGEN_PORTNUM);
-	port_num = (u8)(i & I40E_PFGEN_PORTNUM_PORT_NUM_MASK);
-	phy_addr = i40e_get_phy_address(hw, port_num);
-
-	for (gpio_led_port = 0; gpio_led_port < 3; gpio_led_port++,
-	     led_addr++) {
-		status = i40e_read_phy_register_clause45(hw,
-							 I40E_PHY_COM_REG_PAGE,
-							 led_addr, phy_addr,
-							 &led_reg);
-		if (status)
-			goto phy_blinking_end;
-		led_ctl = led_reg;
-		if (led_reg & I40E_PHY_LED_LINK_MODE_MASK) {
-			led_reg = 0;
-			status = i40e_write_phy_register_clause45(hw,
-							 I40E_PHY_COM_REG_PAGE,
-							 led_addr, phy_addr,
-							 led_reg);
-			if (status)
-				goto phy_blinking_end;
-			break;
-		}
-	}
-
-	if (time > 0 && interval > 0) {
-		for (i = 0; i < time * 1000; i += interval) {
-			status = i40e_read_phy_register_clause45(hw,
-						I40E_PHY_COM_REG_PAGE,
-						led_addr, phy_addr, &led_reg);
-			if (status)
-				goto restore_config;
-			if (led_reg & I40E_PHY_LED_MANUAL_ON)
-				led_reg = 0;
-			else
-				led_reg = I40E_PHY_LED_MANUAL_ON;
-			status = i40e_write_phy_register_clause45(hw,
-						I40E_PHY_COM_REG_PAGE,
-						led_addr, phy_addr, led_reg);
-			if (status)
-				goto restore_config;
-			msleep(interval);
-		}
-	}
-
-restore_config:
-	status = i40e_write_phy_register_clause45(hw,
-						  I40E_PHY_COM_REG_PAGE,
-						  led_addr, phy_addr, led_ctl);
-
-phy_blinking_end:
-	return status;
-}
-
-/**
  * i40e_led_get_reg - read LED register
  * @hw: pointer to the HW structure
  * @led_addr: LED register address
@@ -5269,39 +4883,6 @@ i40e_find_segment_in_package(u32 segment_type,
 	(struct i40e_profile_section_header *)((u8 *)(profile) + (offset))
 
 /**
- * i40e_find_section_in_profile
- * @section_type: the section type to search for (i.e., SECTION_TYPE_NOTE)
- * @profile: pointer to the i40e segment header to be searched
- *
- * This function searches i40e segment for a particular section type. On
- * success it returns a pointer to the section header, otherwise it will
- * return NULL.
- **/
-struct i40e_profile_section_header *
-i40e_find_section_in_profile(u32 section_type,
-			     struct i40e_profile_segment *profile)
-{
-	struct i40e_profile_section_header *sec;
-	struct i40e_section_table *sec_tbl;
-	u32 sec_off;
-	u32 i;
-
-	if (profile->header.type != SEGMENT_TYPE_I40E)
-		return NULL;
-
-	I40E_SECTION_TABLE(profile, sec_tbl);
-
-	for (i = 0; i < sec_tbl->section_count; i++) {
-		sec_off = sec_tbl->section_offset[i];
-		sec = I40E_SECTION_HEADER(profile, sec_off);
-		if (sec->section.type == section_type)
-			return sec;
-	}
-
-	return NULL;
-}
-
-/**
  * i40e_ddp_exec_aq_section - Execute generic AQ for DDP
  * @hw: pointer to the hw struct
  * @aq: command buffer containing all data to execute AQ
@@ -5524,45 +5105,6 @@ i40e_rollback_profile(struct i40e_hw *hw, struct i40e_profile_segment *profile,
 }
 
 /**
- * i40e_add_pinfo_to_list
- * @hw: pointer to the hardware structure
- * @profile: pointer to the profile segment of the package
- * @profile_info_sec: buffer for information section
- * @track_id: package tracking id
- *
- * Register a profile to the list of loaded profiles.
- */
-int
-i40e_add_pinfo_to_list(struct i40e_hw *hw,
-		       struct i40e_profile_segment *profile,
-		       u8 *profile_info_sec, u32 track_id)
-{
-	struct i40e_profile_section_header *sec = NULL;
-	struct i40e_profile_info *pinfo;
-	u32 offset = 0, info = 0;
-	int status = 0;
-
-	sec = (struct i40e_profile_section_header *)profile_info_sec;
-	sec->tbl_size = 1;
-	sec->data_end = sizeof(struct i40e_profile_section_header) +
-			sizeof(struct i40e_profile_info);
-	sec->section.type = SECTION_TYPE_INFO;
-	sec->section.offset = sizeof(struct i40e_profile_section_header);
-	sec->section.size = sizeof(struct i40e_profile_info);
-	pinfo = (struct i40e_profile_info *)(profile_info_sec +
-					     sec->section.offset);
-	pinfo->track_id = track_id;
-	pinfo->version = profile->version;
-	pinfo->op = I40E_DDP_ADD_TRACKID;
-	memcpy(pinfo->name, profile->name, I40E_DDP_NAME_SIZE);
-
-	status = i40e_aq_write_ddp(hw, (void *)sec, sec->data_end,
-				   track_id, &offset, &info, NULL);
-
-	return status;
-}
-
-/**
  * i40e_aq_add_cloud_filters
  * @hw: pointer to the hardware structure
  * @seid: VSI seid to add cloud filters from
diff --git a/drivers/net/ethernet/intel/i40e/i40e_dcb.c b/drivers/net/ethernet/intel/i40e/i40e_dcb.c
index 8db1eb0c1768..352e957443fd 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_dcb.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_dcb.c
@@ -1491,19 +1491,6 @@ void i40e_dcb_hw_set_num_tc(struct i40e_hw *hw, u8 num_tc)
 }
 
 /**
- * i40e_dcb_hw_get_num_tc
- * @hw: pointer to the hw struct
- *
- * Returns number of traffic classes configured in HW
- **/
-u8 i40e_dcb_hw_get_num_tc(struct i40e_hw *hw)
-{
-	u32 reg = rd32(hw, I40E_PRTDCB_GENC);
-
-	return FIELD_GET(I40E_PRTDCB_GENC_NUMTC_MASK, reg);
-}
-
-/**
  * i40e_dcb_hw_rx_ets_bw_config
  * @hw: pointer to the hw struct
  * @bw_share: Bandwidth share indexed per traffic class
diff --git a/drivers/net/ethernet/intel/i40e/i40e_dcb.h b/drivers/net/ethernet/intel/i40e/i40e_dcb.h
index d76497566e40..d5662c639c41 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_dcb.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_dcb.h
@@ -253,7 +253,6 @@ void i40e_dcb_hw_rx_cmd_monitor_config(struct i40e_hw *hw,
 void i40e_dcb_hw_pfc_config(struct i40e_hw *hw,
 			    u8 pfc_en, u8 *prio_tc);
 void i40e_dcb_hw_set_num_tc(struct i40e_hw *hw, u8 num_tc);
-u8 i40e_dcb_hw_get_num_tc(struct i40e_hw *hw);
 void i40e_dcb_hw_rx_ets_bw_config(struct i40e_hw *hw, u8 *bw_share,
 				  u8 *mode, u8 *prio_type);
 void i40e_dcb_hw_rx_up2tc_config(struct i40e_hw *hw, u8 *prio_tc);
diff --git a/drivers/net/ethernet/intel/i40e/i40e_debugfs.c b/drivers/net/ethernet/intel/i40e/i40e_debugfs.c
index 208c2f0857b6..6cd9da662ae1 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_debugfs.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_debugfs.c
@@ -722,7 +722,7 @@ static void i40e_dbg_dump_vf(struct i40e_pf *pf, int vf_id)
 		dev_info(&pf->pdev->dev, "vf %2d: VSI id=%d, seid=%d, qps=%d\n",
 			 vf_id, vf->lan_vsi_id, vsi->seid, vf->num_queue_pairs);
 		dev_info(&pf->pdev->dev, "       num MDD=%lld\n",
-			 vf->num_mdd_events);
+			 vf->mdd_tx_events.count + vf->mdd_rx_events.count);
 	} else {
 		dev_info(&pf->pdev->dev, "invalid VF id %d\n", vf_id);
 	}
diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
index bce5b76f1e7a..8a7a83f83ee5 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
@@ -459,6 +459,8 @@ static const struct i40e_priv_flags i40e_gstrings_priv_flags[] = {
 	I40E_PRIV_FLAG("base-r-fec", I40E_FLAG_BASE_R_FEC, 0),
 	I40E_PRIV_FLAG("vf-vlan-pruning",
 		       I40E_FLAG_VF_VLAN_PRUNING_ENA, 0),
+	I40E_PRIV_FLAG("mdd-auto-reset-vf",
+		       I40E_FLAG_MDD_AUTO_RESET_VF, 0),
 };
 
 #define I40E_PRIV_FLAGS_STR_LEN ARRAY_SIZE(i40e_gstrings_priv_flags)
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 0e1d9e2fbf38..65a702668e21 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -1666,9 +1666,8 @@ struct i40e_mac_filter *i40e_add_filter(struct i40e_vsi *vsi,
  * @vsi: VSI to remove from
  * @f: the filter to remove from the list
  *
- * This function should be called instead of i40e_del_filter only if you know
- * the exact filter you will remove already, such as via i40e_find_filter or
- * i40e_find_mac.
+ * This function requires you've found * the exact filter you will remove
+ * already, such as via i40e_find_filter or i40e_find_mac.
  *
  * NOTE: This function is expected to be called with mac_filter_hash_lock
  * being held.
@@ -1698,29 +1697,6 @@ void __i40e_del_filter(struct i40e_vsi *vsi, struct i40e_mac_filter *f)
 }
 
 /**
- * i40e_del_filter - Remove a MAC/VLAN filter from the VSI
- * @vsi: the VSI to be searched
- * @macaddr: the MAC address
- * @vlan: the VLAN
- *
- * NOTE: This function is expected to be called with mac_filter_hash_lock
- * being held.
- * ANOTHER NOTE: This function MUST be called from within the context of
- * the "safe" variants of any list iterators, e.g. list_for_each_entry_safe()
- * instead of list_for_each_entry().
- **/
-void i40e_del_filter(struct i40e_vsi *vsi, const u8 *macaddr, s16 vlan)
-{
-	struct i40e_mac_filter *f;
-
-	if (!vsi || !macaddr)
-		return;
-
-	f = i40e_find_filter(vsi, macaddr, vlan);
-	__i40e_del_filter(vsi, f);
-}
-
-/**
  * i40e_add_mac_filter - Add a MAC filter for all active VLANs
  * @vsi: the VSI to be searched
  * @macaddr: the mac address to be filtered
@@ -9629,19 +9605,6 @@ static void i40e_handle_lan_overflow_event(struct i40e_pf *pf,
 }
 
 /**
- * i40e_get_cur_guaranteed_fd_count - Get the consumed guaranteed FD filters
- * @pf: board private structure
- **/
-u32 i40e_get_cur_guaranteed_fd_count(struct i40e_pf *pf)
-{
-	u32 val, fcnt_prog;
-
-	val = rd32(&pf->hw, I40E_PFQF_FDSTAT);
-	fcnt_prog = (val & I40E_PFQF_FDSTAT_GUARANT_CNT_MASK);
-	return fcnt_prog;
-}
-
-/**
  * i40e_get_current_fd_count - Get total FD filters programmed for this PF
  * @pf: board private structure
  **/
@@ -11217,6 +11180,67 @@ static void i40e_handle_reset_warning(struct i40e_pf *pf, bool lock_acquired)
 }
 
 /**
+ * i40e_print_vf_mdd_event - print VF Tx/Rx malicious driver detect event
+ * @pf: board private structure
+ * @vf: pointer to the VF structure
+ * @is_tx: true - for Tx event, false - for  Rx
+ */
+static void i40e_print_vf_mdd_event(struct i40e_pf *pf, struct i40e_vf *vf,
+				    bool is_tx)
+{
+	dev_err(&pf->pdev->dev, is_tx ?
+		"%lld Tx Malicious Driver Detection events detected on PF %d VF %d MAC %pm. mdd-auto-reset-vfs=%s\n" :
+		"%lld Rx Malicious Driver Detection events detected on PF %d VF %d MAC %pm. mdd-auto-reset-vfs=%s\n",
+		is_tx ? vf->mdd_tx_events.count : vf->mdd_rx_events.count,
+		pf->hw.pf_id,
+		vf->vf_id,
+		vf->default_lan_addr.addr,
+		str_on_off(test_bit(I40E_FLAG_MDD_AUTO_RESET_VF, pf->flags)));
+}
+
+/**
+ * i40e_print_vfs_mdd_events - print VFs malicious driver detect event
+ * @pf: pointer to the PF structure
+ *
+ * Called from i40e_handle_mdd_event to rate limit and print VFs MDD events.
+ */
+static void i40e_print_vfs_mdd_events(struct i40e_pf *pf)
+{
+	unsigned int i;
+
+	/* check that there are pending MDD events to print */
+	if (!test_and_clear_bit(__I40E_MDD_VF_PRINT_PENDING, pf->state))
+		return;
+
+	if (!__ratelimit(&pf->mdd_message_rate_limit))
+		return;
+
+	for (i = 0; i < pf->num_alloc_vfs; i++) {
+		struct i40e_vf *vf = &pf->vf[i];
+		bool is_printed = false;
+
+		/* only print Rx MDD event message if there are new events */
+		if (vf->mdd_rx_events.count != vf->mdd_rx_events.last_printed) {
+			vf->mdd_rx_events.last_printed = vf->mdd_rx_events.count;
+			i40e_print_vf_mdd_event(pf, vf, false);
+			is_printed = true;
+		}
+
+		/* only print Tx MDD event message if there are new events */
+		if (vf->mdd_tx_events.count != vf->mdd_tx_events.last_printed) {
+			vf->mdd_tx_events.last_printed = vf->mdd_tx_events.count;
+			i40e_print_vf_mdd_event(pf, vf, true);
+			is_printed = true;
+		}
+
+		if (is_printed && !test_bit(I40E_FLAG_MDD_AUTO_RESET_VF, pf->flags))
+			dev_info(&pf->pdev->dev,
+				 "Use PF Control I/F to re-enable the VF #%d\n",
+				 i);
+	}
+}
+
+/**
  * i40e_handle_mdd_event
  * @pf: pointer to the PF structure
  *
@@ -11230,8 +11254,13 @@ static void i40e_handle_mdd_event(struct i40e_pf *pf)
 	u32 reg;
 	int i;
 
-	if (!test_bit(__I40E_MDD_EVENT_PENDING, pf->state))
+	if (!test_and_clear_bit(__I40E_MDD_EVENT_PENDING, pf->state)) {
+		/* Since the VF MDD event logging is rate limited, check if
+		 * there are pending MDD events.
+		 */
+		i40e_print_vfs_mdd_events(pf);
 		return;
+	}
 
 	/* find what triggered the MDD event */
 	reg = rd32(hw, I40E_GL_MDET_TX);
@@ -11275,36 +11304,48 @@ static void i40e_handle_mdd_event(struct i40e_pf *pf)
 
 	/* see if one of the VFs needs its hand slapped */
 	for (i = 0; i < pf->num_alloc_vfs && mdd_detected; i++) {
+		bool is_mdd_on_tx = false;
+		bool is_mdd_on_rx = false;
+
 		vf = &(pf->vf[i]);
 		reg = rd32(hw, I40E_VP_MDET_TX(i));
 		if (reg & I40E_VP_MDET_TX_VALID_MASK) {
+			set_bit(__I40E_MDD_VF_PRINT_PENDING, pf->state);
 			wr32(hw, I40E_VP_MDET_TX(i), 0xFFFF);
-			vf->num_mdd_events++;
-			dev_info(&pf->pdev->dev, "TX driver issue detected on VF %d\n",
-				 i);
-			dev_info(&pf->pdev->dev,
-				 "Use PF Control I/F to re-enable the VF\n");
+			vf->mdd_tx_events.count++;
 			set_bit(I40E_VF_STATE_DISABLED, &vf->vf_states);
+			is_mdd_on_tx = true;
 		}
 
 		reg = rd32(hw, I40E_VP_MDET_RX(i));
 		if (reg & I40E_VP_MDET_RX_VALID_MASK) {
+			set_bit(__I40E_MDD_VF_PRINT_PENDING, pf->state);
 			wr32(hw, I40E_VP_MDET_RX(i), 0xFFFF);
-			vf->num_mdd_events++;
-			dev_info(&pf->pdev->dev, "RX driver issue detected on VF %d\n",
-				 i);
-			dev_info(&pf->pdev->dev,
-				 "Use PF Control I/F to re-enable the VF\n");
+			vf->mdd_rx_events.count++;
 			set_bit(I40E_VF_STATE_DISABLED, &vf->vf_states);
+			is_mdd_on_rx = true;
+		}
+
+		if ((is_mdd_on_tx || is_mdd_on_rx) &&
+		    test_bit(I40E_FLAG_MDD_AUTO_RESET_VF, pf->flags)) {
+			/* VF MDD event counters will be cleared by
+			 * reset, so print the event prior to reset.
+			 */
+			if (is_mdd_on_rx)
+				i40e_print_vf_mdd_event(pf, vf, false);
+			if (is_mdd_on_tx)
+				i40e_print_vf_mdd_event(pf, vf, true);
+
+			i40e_vc_reset_vf(vf, true);
 		}
 	}
 
-	/* re-enable mdd interrupt cause */
-	clear_bit(__I40E_MDD_EVENT_PENDING, pf->state);
 	reg = rd32(hw, I40E_PFINT_ICR0_ENA);
 	reg |=  I40E_PFINT_ICR0_ENA_MAL_DETECT_MASK;
 	wr32(hw, I40E_PFINT_ICR0_ENA, reg);
 	i40e_flush(hw);
+
+	i40e_print_vfs_mdd_events(pf);
 }
 
 /**
@@ -12614,89 +12655,6 @@ int i40e_set_partition_bw_setting(struct i40e_pf *pf)
 }
 
 /**
- * i40e_commit_partition_bw_setting - Commit BW settings for this PF partition
- * @pf: board private structure
- **/
-int i40e_commit_partition_bw_setting(struct i40e_pf *pf)
-{
-	/* Commit temporary BW setting to permanent NVM image */
-	enum i40e_admin_queue_err last_aq_status;
-	u16 nvm_word;
-	int ret;
-
-	if (pf->hw.partition_id != 1) {
-		dev_info(&pf->pdev->dev,
-			 "Commit BW only works on partition 1! This is partition %d",
-			 pf->hw.partition_id);
-		ret = -EOPNOTSUPP;
-		goto bw_commit_out;
-	}
-
-	/* Acquire NVM for read access */
-	ret = i40e_acquire_nvm(&pf->hw, I40E_RESOURCE_READ);
-	last_aq_status = pf->hw.aq.asq_last_status;
-	if (ret) {
-		dev_info(&pf->pdev->dev,
-			 "Cannot acquire NVM for read access, err %pe aq_err %s\n",
-			 ERR_PTR(ret),
-			 i40e_aq_str(&pf->hw, last_aq_status));
-		goto bw_commit_out;
-	}
-
-	/* Read word 0x10 of NVM - SW compatibility word 1 */
-	ret = i40e_aq_read_nvm(&pf->hw,
-			       I40E_SR_NVM_CONTROL_WORD,
-			       0x10, sizeof(nvm_word), &nvm_word,
-			       false, NULL);
-	/* Save off last admin queue command status before releasing
-	 * the NVM
-	 */
-	last_aq_status = pf->hw.aq.asq_last_status;
-	i40e_release_nvm(&pf->hw);
-	if (ret) {
-		dev_info(&pf->pdev->dev, "NVM read error, err %pe aq_err %s\n",
-			 ERR_PTR(ret),
-			 i40e_aq_str(&pf->hw, last_aq_status));
-		goto bw_commit_out;
-	}
-
-	/* Wait a bit for NVM release to complete */
-	msleep(50);
-
-	/* Acquire NVM for write access */
-	ret = i40e_acquire_nvm(&pf->hw, I40E_RESOURCE_WRITE);
-	last_aq_status = pf->hw.aq.asq_last_status;
-	if (ret) {
-		dev_info(&pf->pdev->dev,
-			 "Cannot acquire NVM for write access, err %pe aq_err %s\n",
-			 ERR_PTR(ret),
-			 i40e_aq_str(&pf->hw, last_aq_status));
-		goto bw_commit_out;
-	}
-	/* Write it back out unchanged to initiate update NVM,
-	 * which will force a write of the shadow (alt) RAM to
-	 * the NVM - thus storing the bandwidth values permanently.
-	 */
-	ret = i40e_aq_update_nvm(&pf->hw,
-				 I40E_SR_NVM_CONTROL_WORD,
-				 0x10, sizeof(nvm_word),
-				 &nvm_word, true, 0, NULL);
-	/* Save off last admin queue command status before releasing
-	 * the NVM
-	 */
-	last_aq_status = pf->hw.aq.asq_last_status;
-	i40e_release_nvm(&pf->hw);
-	if (ret)
-		dev_info(&pf->pdev->dev,
-			 "BW settings NOT SAVED, err %pe aq_err %s\n",
-			 ERR_PTR(ret),
-			 i40e_aq_str(&pf->hw, last_aq_status));
-bw_commit_out:
-
-	return ret;
-}
-
-/**
  * i40e_is_total_port_shutdown_enabled - read NVM and return value
  * if total port shutdown feature is enabled for this PF
  * @pf: board private structure
@@ -15998,6 +15956,9 @@ static int i40e_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 			 ERR_PTR(err),
 			 i40e_aq_str(&pf->hw, pf->hw.aq.asq_last_status));
 
+	/* VF MDD event logs are rate limited to one second intervals */
+	ratelimit_state_init(&pf->mdd_message_rate_limit, 1 * HZ, 1);
+
 	/* Reconfigure hardware for allowing smaller MSS in the case
 	 * of TSO, so that we avoid the MDD being fired and causing
 	 * a reset in the case of small MSS+TSO.
diff --git a/drivers/net/ethernet/intel/i40e/i40e_prototype.h b/drivers/net/ethernet/intel/i40e/i40e_prototype.h
index 5a0699ca7ce5..099bb8ab7d70 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_prototype.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_prototype.h
@@ -27,13 +27,6 @@ i40e_asq_send_command(struct i40e_hw *hw, struct i40e_aq_desc *desc,
 		      void *buff, /* can be NULL */ u16  buff_size,
 		      struct i40e_asq_cmd_details *cmd_details);
 int
-i40e_asq_send_command_v2(struct i40e_hw *hw,
-			 struct i40e_aq_desc *desc,
-			 void *buff, /* can be NULL */
-			 u16  buff_size,
-			 struct i40e_asq_cmd_details *cmd_details,
-			 enum i40e_admin_queue_err *aq_status);
-int
 i40e_asq_send_command_atomic(struct i40e_hw *hw, struct i40e_aq_desc *desc,
 			     void *buff, /* can be NULL */ u16  buff_size,
 			     struct i40e_asq_cmd_details *cmd_details,
@@ -72,8 +65,6 @@ int i40e_led_set_phy(struct i40e_hw *hw, bool on,
 		     u16 led_addr, u32 mode);
 int i40e_led_get_phy(struct i40e_hw *hw, u16 *led_addr,
 		     u16 *val);
-int i40e_blink_phy_link_led(struct i40e_hw *hw,
-			    u32 time, u32 interval);
 
 /* admin send queue commands */
 
@@ -141,9 +132,6 @@ int i40e_aq_set_vsi_uc_promisc_on_vlan(struct i40e_hw *hw,
 int i40e_aq_set_vsi_bc_promisc_on_vlan(struct i40e_hw *hw,
 				       u16 seid, bool enable, u16 vid,
 				       struct i40e_asq_cmd_details *cmd_details);
-int i40e_aq_set_vsi_vlan_promisc(struct i40e_hw *hw,
-				 u16 seid, bool enable,
-				 struct i40e_asq_cmd_details *cmd_details);
 int i40e_aq_get_vsi_params(struct i40e_hw *hw,
 			   struct i40e_vsi_context *vsi_ctx,
 			   struct i40e_asq_cmd_details *cmd_details);
@@ -176,14 +164,6 @@ i40e_aq_remove_macvlan_v2(struct i40e_hw *hw, u16 seid,
 			  struct i40e_aqc_remove_macvlan_element_data *mv_list,
 			  u16 count, struct i40e_asq_cmd_details *cmd_details,
 			  enum i40e_admin_queue_err *aq_status);
-int i40e_aq_add_mirrorrule(struct i40e_hw *hw, u16 sw_seid,
-			   u16 rule_type, u16 dest_vsi, u16 count, __le16 *mr_list,
-			   struct i40e_asq_cmd_details *cmd_details,
-			   u16 *rule_id, u16 *rules_used, u16 *rules_free);
-int i40e_aq_delete_mirrorrule(struct i40e_hw *hw, u16 sw_seid,
-			      u16 rule_type, u16 rule_id, u16 count, __le16 *mr_list,
-			      struct i40e_asq_cmd_details *cmd_details,
-			      u16 *rules_used, u16 *rules_free);
 
 int i40e_aq_send_msg_to_vf(struct i40e_hw *hw, u16 vfid,
 			   u32 v_opcode, u32 v_retval, u8 *msg, u16 msglen,
@@ -220,9 +200,6 @@ int i40e_aq_update_nvm(struct i40e_hw *hw, u8 module_pointer,
 		       u32 offset, u16 length, void *data,
 		       bool last_command, u8 preservation_flags,
 		       struct i40e_asq_cmd_details *cmd_details);
-int i40e_aq_rearrange_nvm(struct i40e_hw *hw,
-			  u8 rearrange_nvm,
-			  struct i40e_asq_cmd_details *cmd_details);
 int i40e_aq_get_lldp_mib(struct i40e_hw *hw, u8 bridge_type,
 			 u8 mib_type, void *buff, u16 buff_size,
 			 u16 *local_len, u16 *remote_len,
@@ -234,9 +211,6 @@ i40e_aq_set_lldp_mib(struct i40e_hw *hw,
 int i40e_aq_cfg_lldp_mib_change_event(struct i40e_hw *hw,
 				      bool enable_update,
 				      struct i40e_asq_cmd_details *cmd_details);
-int
-i40e_aq_restore_lldp(struct i40e_hw *hw, u8 *setting, bool restore,
-		     struct i40e_asq_cmd_details *cmd_details);
 int i40e_aq_stop_lldp(struct i40e_hw *hw, bool shutdown_agent,
 		      bool persist,
 		      struct i40e_asq_cmd_details *cmd_details);
@@ -458,13 +432,7 @@ int i40e_read_phy_register_clause45(struct i40e_hw *hw,
 				    u8 page, u16 reg, u8 phy_addr, u16 *value);
 int i40e_write_phy_register_clause45(struct i40e_hw *hw,
 				     u8 page, u16 reg, u8 phy_addr, u16 value);
-int i40e_read_phy_register(struct i40e_hw *hw, u8 page, u16 reg,
-			   u8 phy_addr, u16 *value);
-int i40e_write_phy_register(struct i40e_hw *hw, u8 page, u16 reg,
-			    u8 phy_addr, u16 value);
 u8 i40e_get_phy_address(struct i40e_hw *hw, u8 dev_num);
-int i40e_blink_phy_link_led(struct i40e_hw *hw,
-			    u32 time, u32 interval);
 int i40e_aq_write_ddp(struct i40e_hw *hw, void *buff,
 		      u16 buff_size, u32 track_id,
 		      u32 *error_offset, u32 *error_info,
@@ -477,20 +445,12 @@ int i40e_aq_get_ddp_list(struct i40e_hw *hw, void *buff,
 struct i40e_generic_seg_header *
 i40e_find_segment_in_package(u32 segment_type,
 			     struct i40e_package_header *pkg_header);
-struct i40e_profile_section_header *
-i40e_find_section_in_profile(u32 section_type,
-			     struct i40e_profile_segment *profile);
 int
 i40e_write_profile(struct i40e_hw *hw, struct i40e_profile_segment *i40e_seg,
 		   u32 track_id);
 int
 i40e_rollback_profile(struct i40e_hw *hw, struct i40e_profile_segment *i40e_seg,
 		      u32 track_id);
-int
-i40e_add_pinfo_to_list(struct i40e_hw *hw,
-		       struct i40e_profile_segment *profile,
-		       u8 *profile_info_sec, u32 track_id);
-
 /* i40e_ddp */
 int i40e_ddp_flash(struct net_device *netdev, struct ethtool_flash *flash);
 
diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
index dfa785e39458..1120f8e4bb67 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
@@ -216,7 +216,7 @@ void i40e_vc_notify_vf_reset(struct i40e_vf *vf)
  * @notify_vf: notify vf about reset or not
  * Reset VF handler.
  **/
-static void i40e_vc_reset_vf(struct i40e_vf *vf, bool notify_vf)
+void i40e_vc_reset_vf(struct i40e_vf *vf, bool notify_vf)
 {
 	struct i40e_pf *pf = vf->pf;
 	int i;
diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h
index 66f95e2f3146..5cf74f16f433 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h
@@ -64,6 +64,12 @@ struct i40evf_channel {
 	u64 max_tx_rate; /* bandwidth rate allocation for VSIs */
 };
 
+struct i40e_mdd_vf_events {
+	u64 count;      /* total count of Rx|Tx events */
+	/* count number of the last printed event */
+	u64 last_printed;
+};
+
 /* VF information structure */
 struct i40e_vf {
 	struct i40e_pf *pf;
@@ -92,7 +98,9 @@ struct i40e_vf {
 
 	u8 num_queue_pairs;	/* num of qps assigned to VF vsis */
 	u8 num_req_queues;	/* num of requested qps */
-	u64 num_mdd_events;	/* num of mdd events detected */
+	/* num of mdd tx and rx events detected */
+	struct i40e_mdd_vf_events mdd_rx_events;
+	struct i40e_mdd_vf_events mdd_tx_events;
 
 	unsigned long vf_caps;	/* vf's adv. capabilities */
 	unsigned long vf_states;	/* vf's runtime states */
@@ -120,6 +128,7 @@ int i40e_alloc_vfs(struct i40e_pf *pf, u16 num_alloc_vfs);
 int i40e_vc_process_vf_msg(struct i40e_pf *pf, s16 vf_id, u32 v_opcode,
 			   u32 v_retval, u8 *msg, u16 msglen);
 int i40e_vc_process_vflr_event(struct i40e_pf *pf);
+void i40e_vc_reset_vf(struct i40e_vf *vf, bool notify_vf);
 bool i40e_reset_vf(struct i40e_vf *vf, bool flr);
 bool i40e_reset_all_vfs(struct i40e_pf *pf, bool flr);
 void i40e_vc_notify_vf_reset(struct i40e_vf *vf);
diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c
index a9e54866ae6b..7740f446c73f 100644
--- a/drivers/net/ethernet/intel/iavf/iavf_main.c
+++ b/drivers/net/ethernet/intel/iavf/iavf_main.c
@@ -1968,6 +1968,7 @@ err:
 static void iavf_finish_config(struct work_struct *work)
 {
 	struct iavf_adapter *adapter;
+	bool netdev_released = false;
 	int pairs, err;
 
 	adapter = container_of(work, struct iavf_adapter, finish_config);
@@ -1988,7 +1989,16 @@ static void iavf_finish_config(struct work_struct *work)
 
 	switch (adapter->state) {
 	case __IAVF_DOWN:
+		/* Set the real number of queues when reset occurs while
+		 * state == __IAVF_DOWN
+		 */
+		pairs = adapter->num_active_queues;
+		netif_set_real_num_rx_queues(adapter->netdev, pairs);
+		netif_set_real_num_tx_queues(adapter->netdev, pairs);
+
 		if (adapter->netdev->reg_state != NETREG_REGISTERED) {
+			mutex_unlock(&adapter->netdev->lock);
+			netdev_released = true;
 			err = register_netdevice(adapter->netdev);
 			if (err) {
 				dev_err(&adapter->pdev->dev, "Unable to register netdev (%d)\n",
@@ -2003,11 +2013,7 @@ static void iavf_finish_config(struct work_struct *work)
 				goto out;
 			}
 		}
-
-		/* Set the real number of queues when reset occurs while
-		 * state == __IAVF_DOWN
-		 */
-		fallthrough;
+		break;
 	case __IAVF_RUNNING:
 		pairs = adapter->num_active_queues;
 		netif_set_real_num_rx_queues(adapter->netdev, pairs);
@@ -2020,7 +2026,8 @@ static void iavf_finish_config(struct work_struct *work)
 
 out:
 	mutex_unlock(&adapter->crit_lock);
-	mutex_unlock(&adapter->netdev->lock);
+	if (!netdev_released)
+		mutex_unlock(&adapter->netdev->lock);
 	rtnl_unlock();
 }
 
@@ -2713,12 +2720,16 @@ static void iavf_watchdog_task(struct work_struct *work)
 	struct iavf_adapter *adapter = container_of(work,
 						    struct iavf_adapter,
 						    watchdog_task.work);
+	struct net_device *netdev = adapter->netdev;
 	struct iavf_hw *hw = &adapter->hw;
 	u32 reg_val;
 
+	mutex_lock(&netdev->lock);
 	if (!mutex_trylock(&adapter->crit_lock)) {
-		if (adapter->state == __IAVF_REMOVE)
+		if (adapter->state == __IAVF_REMOVE) {
+			mutex_unlock(&netdev->lock);
 			return;
+		}
 
 		goto restart_watchdog;
 	}
@@ -2730,30 +2741,35 @@ static void iavf_watchdog_task(struct work_struct *work)
 	case __IAVF_STARTUP:
 		iavf_startup(adapter);
 		mutex_unlock(&adapter->crit_lock);
+		mutex_unlock(&netdev->lock);
 		queue_delayed_work(adapter->wq, &adapter->watchdog_task,
 				   msecs_to_jiffies(30));
 		return;
 	case __IAVF_INIT_VERSION_CHECK:
 		iavf_init_version_check(adapter);
 		mutex_unlock(&adapter->crit_lock);
+		mutex_unlock(&netdev->lock);
 		queue_delayed_work(adapter->wq, &adapter->watchdog_task,
 				   msecs_to_jiffies(30));
 		return;
 	case __IAVF_INIT_GET_RESOURCES:
 		iavf_init_get_resources(adapter);
 		mutex_unlock(&adapter->crit_lock);
+		mutex_unlock(&netdev->lock);
 		queue_delayed_work(adapter->wq, &adapter->watchdog_task,
 				   msecs_to_jiffies(1));
 		return;
 	case __IAVF_INIT_EXTENDED_CAPS:
 		iavf_init_process_extended_caps(adapter);
 		mutex_unlock(&adapter->crit_lock);
+		mutex_unlock(&netdev->lock);
 		queue_delayed_work(adapter->wq, &adapter->watchdog_task,
 				   msecs_to_jiffies(1));
 		return;
 	case __IAVF_INIT_CONFIG_ADAPTER:
 		iavf_init_config_adapter(adapter);
 		mutex_unlock(&adapter->crit_lock);
+		mutex_unlock(&netdev->lock);
 		queue_delayed_work(adapter->wq, &adapter->watchdog_task,
 				   msecs_to_jiffies(1));
 		return;
@@ -2765,6 +2781,7 @@ static void iavf_watchdog_task(struct work_struct *work)
 			 * as it can loop forever
 			 */
 			mutex_unlock(&adapter->crit_lock);
+			mutex_unlock(&netdev->lock);
 			return;
 		}
 		if (++adapter->aq_wait_count > IAVF_AQ_MAX_ERR) {
@@ -2773,6 +2790,7 @@ static void iavf_watchdog_task(struct work_struct *work)
 			adapter->flags |= IAVF_FLAG_PF_COMMS_FAILED;
 			iavf_shutdown_adminq(hw);
 			mutex_unlock(&adapter->crit_lock);
+			mutex_unlock(&netdev->lock);
 			queue_delayed_work(adapter->wq,
 					   &adapter->watchdog_task, (5 * HZ));
 			return;
@@ -2780,6 +2798,7 @@ static void iavf_watchdog_task(struct work_struct *work)
 		/* Try again from failed step*/
 		iavf_change_state(adapter, adapter->last_state);
 		mutex_unlock(&adapter->crit_lock);
+		mutex_unlock(&netdev->lock);
 		queue_delayed_work(adapter->wq, &adapter->watchdog_task, HZ);
 		return;
 	case __IAVF_COMM_FAILED:
@@ -2792,6 +2811,7 @@ static void iavf_watchdog_task(struct work_struct *work)
 			iavf_change_state(adapter, __IAVF_INIT_FAILED);
 			adapter->flags &= ~IAVF_FLAG_PF_COMMS_FAILED;
 			mutex_unlock(&adapter->crit_lock);
+			mutex_unlock(&netdev->lock);
 			return;
 		}
 		reg_val = rd32(hw, IAVF_VFGEN_RSTAT) &
@@ -2811,12 +2831,14 @@ static void iavf_watchdog_task(struct work_struct *work)
 		adapter->aq_required = 0;
 		adapter->current_op = VIRTCHNL_OP_UNKNOWN;
 		mutex_unlock(&adapter->crit_lock);
+		mutex_unlock(&netdev->lock);
 		queue_delayed_work(adapter->wq,
 				   &adapter->watchdog_task,
 				   msecs_to_jiffies(10));
 		return;
 	case __IAVF_RESETTING:
 		mutex_unlock(&adapter->crit_lock);
+		mutex_unlock(&netdev->lock);
 		queue_delayed_work(adapter->wq, &adapter->watchdog_task,
 				   HZ * 2);
 		return;
@@ -2847,6 +2869,7 @@ static void iavf_watchdog_task(struct work_struct *work)
 	case __IAVF_REMOVE:
 	default:
 		mutex_unlock(&adapter->crit_lock);
+		mutex_unlock(&netdev->lock);
 		return;
 	}
 
@@ -2858,12 +2881,14 @@ static void iavf_watchdog_task(struct work_struct *work)
 		dev_err(&adapter->pdev->dev, "Hardware reset detected\n");
 		iavf_schedule_reset(adapter, IAVF_FLAG_RESET_PENDING);
 		mutex_unlock(&adapter->crit_lock);
+		mutex_unlock(&netdev->lock);
 		queue_delayed_work(adapter->wq,
 				   &adapter->watchdog_task, HZ * 2);
 		return;
 	}
 
 	mutex_unlock(&adapter->crit_lock);
+	mutex_unlock(&netdev->lock);
 restart_watchdog:
 	if (adapter->state >= __IAVF_DOWN)
 		queue_work(adapter->wq, &adapter->adminq_task);
@@ -4340,14 +4365,17 @@ static int iavf_open(struct net_device *netdev)
 		return -EIO;
 	}
 
+	mutex_lock(&netdev->lock);
 	while (!mutex_trylock(&adapter->crit_lock)) {
 		/* If we are in __IAVF_INIT_CONFIG_ADAPTER state the crit_lock
 		 * is already taken and iavf_open is called from an upper
 		 * device's notifier reacting on NETDEV_REGISTER event.
 		 * We have to leave here to avoid dead lock.
 		 */
-		if (adapter->state == __IAVF_INIT_CONFIG_ADAPTER)
+		if (adapter->state == __IAVF_INIT_CONFIG_ADAPTER) {
+			mutex_unlock(&netdev->lock);
 			return -EBUSY;
+		}
 
 		usleep_range(500, 1000);
 	}
@@ -4396,6 +4424,7 @@ static int iavf_open(struct net_device *netdev)
 	iavf_irq_enable(adapter, true);
 
 	mutex_unlock(&adapter->crit_lock);
+	mutex_unlock(&netdev->lock);
 
 	return 0;
 
@@ -4408,6 +4437,7 @@ err_setup_tx:
 	iavf_free_all_tx_resources(adapter);
 err_unlock:
 	mutex_unlock(&adapter->crit_lock);
+	mutex_unlock(&netdev->lock);
 
 	return err;
 }
@@ -4429,10 +4459,12 @@ static int iavf_close(struct net_device *netdev)
 	u64 aq_to_restore;
 	int status;
 
+	mutex_lock(&netdev->lock);
 	mutex_lock(&adapter->crit_lock);
 
 	if (adapter->state <= __IAVF_DOWN_PENDING) {
 		mutex_unlock(&adapter->crit_lock);
+		mutex_unlock(&netdev->lock);
 		return 0;
 	}
 
@@ -4466,6 +4498,7 @@ static int iavf_close(struct net_device *netdev)
 	iavf_free_traffic_irqs(adapter);
 
 	mutex_unlock(&adapter->crit_lock);
+	mutex_unlock(&netdev->lock);
 
 	/* We explicitly don't free resources here because the hardware is
 	 * still active and can DMA into memory. Resources are cleared in
@@ -5342,6 +5375,7 @@ static int iavf_suspend(struct device *dev_d)
 
 	netif_device_detach(netdev);
 
+	mutex_lock(&netdev->lock);
 	mutex_lock(&adapter->crit_lock);
 
 	if (netif_running(netdev)) {
@@ -5353,6 +5387,7 @@ static int iavf_suspend(struct device *dev_d)
 	iavf_reset_interrupt_capability(adapter);
 
 	mutex_unlock(&adapter->crit_lock);
+	mutex_unlock(&netdev->lock);
 
 	return 0;
 }
@@ -5451,6 +5486,7 @@ static void iavf_remove(struct pci_dev *pdev)
 	if (netdev->reg_state == NETREG_REGISTERED)
 		unregister_netdev(netdev);
 
+	mutex_lock(&netdev->lock);
 	mutex_lock(&adapter->crit_lock);
 	dev_info(&adapter->pdev->dev, "Removing device\n");
 	iavf_change_state(adapter, __IAVF_REMOVE);
@@ -5487,6 +5523,7 @@ static void iavf_remove(struct pci_dev *pdev)
 	mutex_destroy(&hw->aq.asq_mutex);
 	mutex_unlock(&adapter->crit_lock);
 	mutex_destroy(&adapter->crit_lock);
+	mutex_unlock(&netdev->lock);
 
 	iounmap(hw->hw_addr);
 	pci_release_regions(pdev);
diff --git a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h
index 3bf05b135b35..73f5fddf3ee9 100644
--- a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h
+++ b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h
@@ -2271,6 +2271,8 @@ struct ice_aqc_get_pkg_info_resp {
 	struct ice_aqc_get_pkg_info pkg_info[];
 };
 
+#define ICE_AQC_GET_CGU_MAX_PHASE_ADJ	GENMASK(30, 0)
+
 /* Get CGU abilities command response data structure (indirect 0x0C61) */
 struct ice_aqc_get_cgu_abilities {
 	u8 num_inputs;
diff --git a/drivers/net/ethernet/intel/ice/ice_dpll.c b/drivers/net/ethernet/intel/ice/ice_dpll.c
index d5ad6d84007c..38e151c7ea23 100644
--- a/drivers/net/ethernet/intel/ice/ice_dpll.c
+++ b/drivers/net/ethernet/intel/ice/ice_dpll.c
@@ -2065,6 +2065,18 @@ static int ice_dpll_init_worker(struct ice_pf *pf)
 }
 
 /**
+ * ice_dpll_phase_range_set - initialize phase adjust range helper
+ * @range: pointer to phase adjust range struct to be initialized
+ * @phase_adj: a value to be used as min(-)/max(+) boundary
+ */
+static void ice_dpll_phase_range_set(struct dpll_pin_phase_adjust_range *range,
+				     u32 phase_adj)
+{
+	range->min = -phase_adj;
+	range->max = phase_adj;
+}
+
+/**
  * ice_dpll_init_info_pins_generic - initializes generic pins info
  * @pf: board private structure
  * @input: if input pins initialized
@@ -2105,8 +2117,8 @@ static int ice_dpll_init_info_pins_generic(struct ice_pf *pf, bool input)
 	for (i = 0; i < pin_num; i++) {
 		pins[i].idx = i;
 		pins[i].prop.board_label = labels[i];
-		pins[i].prop.phase_range.min = phase_adj_max;
-		pins[i].prop.phase_range.max = -phase_adj_max;
+		ice_dpll_phase_range_set(&pins[i].prop.phase_range,
+					 phase_adj_max);
 		pins[i].prop.capabilities = cap;
 		pins[i].pf = pf;
 		ret = ice_dpll_pin_state_update(pf, &pins[i], pin_type, NULL);
@@ -2152,6 +2164,7 @@ ice_dpll_init_info_direct_pins(struct ice_pf *pf,
 	struct ice_hw *hw = &pf->hw;
 	struct ice_dpll_pin *pins;
 	unsigned long caps;
+	u32 phase_adj_max;
 	u8 freq_supp_num;
 	bool input;
 
@@ -2159,11 +2172,13 @@ ice_dpll_init_info_direct_pins(struct ice_pf *pf,
 	case ICE_DPLL_PIN_TYPE_INPUT:
 		pins = pf->dplls.inputs;
 		num_pins = pf->dplls.num_inputs;
+		phase_adj_max = pf->dplls.input_phase_adj_max;
 		input = true;
 		break;
 	case ICE_DPLL_PIN_TYPE_OUTPUT:
 		pins = pf->dplls.outputs;
 		num_pins = pf->dplls.num_outputs;
+		phase_adj_max = pf->dplls.output_phase_adj_max;
 		input = false;
 		break;
 	default:
@@ -2188,19 +2203,13 @@ ice_dpll_init_info_direct_pins(struct ice_pf *pf,
 				return ret;
 			caps |= (DPLL_PIN_CAPABILITIES_PRIORITY_CAN_CHANGE |
 				 DPLL_PIN_CAPABILITIES_STATE_CAN_CHANGE);
-			pins[i].prop.phase_range.min =
-				pf->dplls.input_phase_adj_max;
-			pins[i].prop.phase_range.max =
-				-pf->dplls.input_phase_adj_max;
 		} else {
-			pins[i].prop.phase_range.min =
-				pf->dplls.output_phase_adj_max;
-			pins[i].prop.phase_range.max =
-				-pf->dplls.output_phase_adj_max;
 			ret = ice_cgu_get_output_pin_state_caps(hw, i, &caps);
 			if (ret)
 				return ret;
 		}
+		ice_dpll_phase_range_set(&pins[i].prop.phase_range,
+					 phase_adj_max);
 		pins[i].prop.capabilities = caps;
 		ret = ice_dpll_pin_state_update(pf, &pins[i], pin_type, NULL);
 		if (ret)
@@ -2308,8 +2317,10 @@ static int ice_dpll_init_info(struct ice_pf *pf, bool cgu)
 	dp->dpll_idx = abilities.pps_dpll_idx;
 	d->num_inputs = abilities.num_inputs;
 	d->num_outputs = abilities.num_outputs;
-	d->input_phase_adj_max = le32_to_cpu(abilities.max_in_phase_adj);
-	d->output_phase_adj_max = le32_to_cpu(abilities.max_out_phase_adj);
+	d->input_phase_adj_max = le32_to_cpu(abilities.max_in_phase_adj) &
+		ICE_AQC_GET_CGU_MAX_PHASE_ADJ;
+	d->output_phase_adj_max = le32_to_cpu(abilities.max_out_phase_adj) &
+		ICE_AQC_GET_CGU_MAX_PHASE_ADJ;
 
 	alloc_size = sizeof(*d->inputs) * d->num_inputs;
 	d->inputs = kzalloc(alloc_size, GFP_KERNEL);
diff --git a/drivers/net/ethernet/intel/ice/ice_ptp_consts.h b/drivers/net/ethernet/intel/ice/ice_ptp_consts.h
index 585ce200c60f..d75f0eddd631 100644
--- a/drivers/net/ethernet/intel/ice/ice_ptp_consts.h
+++ b/drivers/net/ethernet/intel/ice/ice_ptp_consts.h
@@ -761,9 +761,9 @@ const struct ice_vernier_info_e82x e822_vernier[NUM_ICE_PTP_LNK_SPD] = {
 		/* rx_desk_rsgb_par */
 		644531250, /* 644.53125 MHz Reed Solomon gearbox */
 		/* tx_desk_rsgb_pcs */
-		644531250, /* 644.53125 MHz Reed Solomon gearbox */
+		390625000, /* 390.625 MHz Reed Solomon gearbox */
 		/* rx_desk_rsgb_pcs */
-		644531250, /* 644.53125 MHz Reed Solomon gearbox */
+		390625000, /* 390.625 MHz Reed Solomon gearbox */
 		/* tx_fixed_delay */
 		1620,
 		/* pmd_adj_divisor */
diff --git a/drivers/net/ethernet/intel/igb/Makefile b/drivers/net/ethernet/intel/igb/Makefile
index 463c0d26b9d4..6c1b702fd992 100644
--- a/drivers/net/ethernet/intel/igb/Makefile
+++ b/drivers/net/ethernet/intel/igb/Makefile
@@ -8,4 +8,4 @@ obj-$(CONFIG_IGB) += igb.o
 
 igb-y := igb_main.o igb_ethtool.o e1000_82575.o \
 	 e1000_mac.o e1000_nvm.o e1000_phy.o e1000_mbx.o \
-	 e1000_i210.o igb_ptp.o igb_hwmon.o
+	 e1000_i210.o igb_ptp.o igb_hwmon.o igb_xsk.o
diff --git a/drivers/net/ethernet/intel/igb/igb.h b/drivers/net/ethernet/intel/igb/igb.h
index 3c2dc7bdebb5..02f340280d20 100644
--- a/drivers/net/ethernet/intel/igb/igb.h
+++ b/drivers/net/ethernet/intel/igb/igb.h
@@ -18,8 +18,10 @@
 #include <linux/i2c-algo-bit.h>
 #include <linux/pci.h>
 #include <linux/mdio.h>
+#include <linux/lockdep.h>
 
 #include <net/xdp.h>
+#include <net/xdp_sock_drv.h>
 
 struct igb_adapter;
 
@@ -86,6 +88,7 @@ struct igb_adapter;
 #define IGB_XDP_CONSUMED	BIT(0)
 #define IGB_XDP_TX		BIT(1)
 #define IGB_XDP_REDIR		BIT(2)
+#define IGB_XDP_EXIT		BIT(3)
 
 struct vf_data_storage {
 	unsigned char vf_mac_addresses[ETH_ALEN];
@@ -255,6 +258,7 @@ enum igb_tx_flags {
 enum igb_tx_buf_type {
 	IGB_TYPE_SKB = 0,
 	IGB_TYPE_XDP,
+	IGB_TYPE_XSK
 };
 
 /* wrapper around a pointer to a socket buffer,
@@ -320,6 +324,7 @@ struct igb_ring {
 	union {				/* array of buffer info structs */
 		struct igb_tx_buffer *tx_buffer_info;
 		struct igb_rx_buffer *rx_buffer_info;
+		struct xdp_buff **rx_buffer_info_zc;
 	};
 	void *desc;			/* descriptor ring memory */
 	unsigned long flags;		/* ring specific flags */
@@ -357,6 +362,7 @@ struct igb_ring {
 		};
 	};
 	struct xdp_rxq_info xdp_rxq;
+	struct xsk_buff_pool *xsk_pool;
 } ____cacheline_internodealigned_in_smp;
 
 struct igb_q_vector {
@@ -384,7 +390,8 @@ enum e1000_ring_flags_t {
 	IGB_RING_FLAG_RX_SCTP_CSUM,
 	IGB_RING_FLAG_RX_LB_VLAN_BSWAP,
 	IGB_RING_FLAG_TX_CTX_IDX,
-	IGB_RING_FLAG_TX_DETECT_HANG
+	IGB_RING_FLAG_TX_DETECT_HANG,
+	IGB_RING_FLAG_TX_DISABLED
 };
 
 #define ring_uses_large_buffer(ring) \
@@ -731,12 +738,21 @@ int igb_setup_tx_resources(struct igb_ring *);
 int igb_setup_rx_resources(struct igb_ring *);
 void igb_free_tx_resources(struct igb_ring *);
 void igb_free_rx_resources(struct igb_ring *);
+void igb_clean_tx_ring(struct igb_ring *tx_ring);
+void igb_clean_rx_ring(struct igb_ring *rx_ring);
 void igb_configure_tx_ring(struct igb_adapter *, struct igb_ring *);
 void igb_configure_rx_ring(struct igb_adapter *, struct igb_ring *);
+void igb_finalize_xdp(struct igb_adapter *adapter, unsigned int status);
+void igb_update_rx_stats(struct igb_q_vector *q_vector, unsigned int packets,
+			 unsigned int bytes);
 void igb_setup_tctl(struct igb_adapter *);
 void igb_setup_rctl(struct igb_adapter *);
 void igb_setup_srrctl(struct igb_adapter *, struct igb_ring *);
 netdev_tx_t igb_xmit_frame_ring(struct sk_buff *, struct igb_ring *);
+int igb_xdp_xmit_back(struct igb_adapter *adapter, struct xdp_buff *xdp);
+void igb_process_skb_fields(struct igb_ring *rx_ring,
+			    union e1000_adv_rx_desc *rx_desc,
+			    struct sk_buff *skb);
 void igb_alloc_rx_buffers(struct igb_ring *, u16);
 void igb_update_stats(struct igb_adapter *);
 bool igb_has_link(struct igb_adapter *adapter);
@@ -797,6 +813,33 @@ static inline struct netdev_queue *txring_txq(const struct igb_ring *tx_ring)
 	return netdev_get_tx_queue(tx_ring->netdev, tx_ring->queue_index);
 }
 
+/* This function assumes __netif_tx_lock is held by the caller. */
+static inline void igb_xdp_ring_update_tail(struct igb_ring *ring)
+{
+	lockdep_assert_held(&txring_txq(ring)->_xmit_lock);
+
+	/* Force memory writes to complete before letting h/w know there
+	 * are new descriptors to fetch.
+	 */
+	wmb();
+	writel(ring->next_to_use, ring->tail);
+}
+
+static inline struct igb_ring *igb_xdp_tx_queue_mapping(struct igb_adapter *adapter)
+{
+	unsigned int r_idx = smp_processor_id();
+
+	if (r_idx >= adapter->num_tx_queues)
+		r_idx = r_idx % adapter->num_tx_queues;
+
+	return adapter->tx_ring[r_idx];
+}
+
+static inline bool igb_xdp_is_enabled(struct igb_adapter *adapter)
+{
+	return !!READ_ONCE(adapter->xdp_prog);
+}
+
 int igb_add_filter(struct igb_adapter *adapter,
 		   struct igb_nfc_filter *input);
 int igb_erase_filter(struct igb_adapter *adapter,
@@ -807,4 +850,17 @@ int igb_add_mac_steering_filter(struct igb_adapter *adapter,
 int igb_del_mac_steering_filter(struct igb_adapter *adapter,
 				const u8 *addr, u8 queue, u8 flags);
 
+struct xsk_buff_pool *igb_xsk_pool(struct igb_adapter *adapter,
+				   struct igb_ring *ring);
+int igb_xsk_pool_setup(struct igb_adapter *adapter,
+		       struct xsk_buff_pool *pool,
+		       u16 qid);
+bool igb_alloc_rx_buffers_zc(struct igb_ring *rx_ring,
+			     struct xsk_buff_pool *xsk_pool, u16 count);
+void igb_clean_rx_ring_zc(struct igb_ring *rx_ring);
+int igb_clean_rx_irq_zc(struct igb_q_vector *q_vector,
+			struct xsk_buff_pool *xsk_pool, const int budget);
+bool igb_xmit_zc(struct igb_ring *tx_ring, struct xsk_buff_pool *xsk_pool);
+int igb_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags);
+
 #endif /* _IGB_H_ */
diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index 288a4bb2683a..d368b753a467 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -33,7 +33,6 @@
 #include <linux/bpf_trace.h>
 #include <linux/pm_runtime.h>
 #include <linux/etherdevice.h>
-#include <linux/lockdep.h>
 #ifdef CONFIG_IGB_DCA
 #include <linux/dca.h>
 #endif
@@ -116,8 +115,6 @@ static void igb_configure_tx(struct igb_adapter *);
 static void igb_configure_rx(struct igb_adapter *);
 static void igb_clean_all_tx_rings(struct igb_adapter *);
 static void igb_clean_all_rx_rings(struct igb_adapter *);
-static void igb_clean_tx_ring(struct igb_ring *);
-static void igb_clean_rx_ring(struct igb_ring *);
 static void igb_set_rx_mode(struct net_device *);
 static void igb_update_phy_info(struct timer_list *);
 static void igb_watchdog(struct timer_list *);
@@ -475,12 +472,17 @@ rx_ring_summary:
 
 		for (i = 0; i < rx_ring->count; i++) {
 			const char *next_desc;
-			struct igb_rx_buffer *buffer_info;
-			buffer_info = &rx_ring->rx_buffer_info[i];
+			dma_addr_t dma = (dma_addr_t)0;
+			struct igb_rx_buffer *buffer_info = NULL;
 			rx_desc = IGB_RX_DESC(rx_ring, i);
 			u0 = (struct my_u0 *)rx_desc;
 			staterr = le32_to_cpu(rx_desc->wb.upper.status_error);
 
+			if (!rx_ring->xsk_pool) {
+				buffer_info = &rx_ring->rx_buffer_info[i];
+				dma = buffer_info->dma;
+			}
+
 			if (i == rx_ring->next_to_use)
 				next_desc = " NTU";
 			else if (i == rx_ring->next_to_clean)
@@ -500,11 +502,11 @@ rx_ring_summary:
 					"R  ", i,
 					le64_to_cpu(u0->a),
 					le64_to_cpu(u0->b),
-					(u64)buffer_info->dma,
+					(u64)dma,
 					next_desc);
 
 				if (netif_msg_pktdata(adapter) &&
-				    buffer_info->dma && buffer_info->page) {
+				    buffer_info && dma && buffer_info->page) {
 					print_hex_dump(KERN_INFO, "",
 					  DUMP_PREFIX_ADDRESS,
 					  16, 1,
@@ -1990,7 +1992,11 @@ static void igb_configure(struct igb_adapter *adapter)
 	 */
 	for (i = 0; i < adapter->num_rx_queues; i++) {
 		struct igb_ring *ring = adapter->rx_ring[i];
-		igb_alloc_rx_buffers(ring, igb_desc_unused(ring));
+		if (ring->xsk_pool)
+			igb_alloc_rx_buffers_zc(ring, ring->xsk_pool,
+						igb_desc_unused(ring));
+		else
+			igb_alloc_rx_buffers(ring, igb_desc_unused(ring));
 	}
 }
 
@@ -2911,37 +2917,20 @@ static int igb_xdp_setup(struct net_device *dev, struct netdev_bpf *bpf)
 
 static int igb_xdp(struct net_device *dev, struct netdev_bpf *xdp)
 {
+	struct igb_adapter *adapter = netdev_priv(dev);
+
 	switch (xdp->command) {
 	case XDP_SETUP_PROG:
 		return igb_xdp_setup(dev, xdp);
+	case XDP_SETUP_XSK_POOL:
+		return igb_xsk_pool_setup(adapter, xdp->xsk.pool,
+					  xdp->xsk.queue_id);
 	default:
 		return -EINVAL;
 	}
 }
 
-/* This function assumes __netif_tx_lock is held by the caller. */
-static void igb_xdp_ring_update_tail(struct igb_ring *ring)
-{
-	lockdep_assert_held(&txring_txq(ring)->_xmit_lock);
-
-	/* Force memory writes to complete before letting h/w know there
-	 * are new descriptors to fetch.
-	 */
-	wmb();
-	writel(ring->next_to_use, ring->tail);
-}
-
-static struct igb_ring *igb_xdp_tx_queue_mapping(struct igb_adapter *adapter)
-{
-	unsigned int r_idx = smp_processor_id();
-
-	if (r_idx >= adapter->num_tx_queues)
-		r_idx = r_idx % adapter->num_tx_queues;
-
-	return adapter->tx_ring[r_idx];
-}
-
-static int igb_xdp_xmit_back(struct igb_adapter *adapter, struct xdp_buff *xdp)
+int igb_xdp_xmit_back(struct igb_adapter *adapter, struct xdp_buff *xdp)
 {
 	struct xdp_frame *xdpf = xdp_convert_buff_to_frame(xdp);
 	int cpu = smp_processor_id();
@@ -2955,7 +2944,8 @@ static int igb_xdp_xmit_back(struct igb_adapter *adapter, struct xdp_buff *xdp)
 	/* During program transitions its possible adapter->xdp_prog is assigned
 	 * but ring has not been configured yet. In this case simply abort xmit.
 	 */
-	tx_ring = adapter->xdp_prog ? igb_xdp_tx_queue_mapping(adapter) : NULL;
+	tx_ring = igb_xdp_is_enabled(adapter) ?
+		igb_xdp_tx_queue_mapping(adapter) : NULL;
 	if (unlikely(!tx_ring))
 		return IGB_XDP_CONSUMED;
 
@@ -2988,10 +2978,14 @@ static int igb_xdp_xmit(struct net_device *dev, int n,
 	/* During program transitions its possible adapter->xdp_prog is assigned
 	 * but ring has not been configured yet. In this case simply abort xmit.
 	 */
-	tx_ring = adapter->xdp_prog ? igb_xdp_tx_queue_mapping(adapter) : NULL;
+	tx_ring = igb_xdp_is_enabled(adapter) ?
+		igb_xdp_tx_queue_mapping(adapter) : NULL;
 	if (unlikely(!tx_ring))
 		return -ENXIO;
 
+	if (unlikely(test_bit(IGB_RING_FLAG_TX_DISABLED, &tx_ring->flags)))
+		return -ENXIO;
+
 	nq = txring_txq(tx_ring);
 	__netif_tx_lock(nq, cpu);
 
@@ -3042,6 +3036,7 @@ static const struct net_device_ops igb_netdev_ops = {
 	.ndo_setup_tc		= igb_setup_tc,
 	.ndo_bpf		= igb_xdp,
 	.ndo_xdp_xmit		= igb_xdp_xmit,
+	.ndo_xsk_wakeup         = igb_xsk_wakeup,
 };
 
 /**
@@ -3338,7 +3333,8 @@ static int igb_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	netdev->priv_flags |= IFF_SUPP_NOFCS;
 
 	netdev->priv_flags |= IFF_UNICAST_FLT;
-	netdev->xdp_features = NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_REDIRECT;
+	netdev->xdp_features = NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_REDIRECT |
+			       NETDEV_XDP_ACT_XSK_ZEROCOPY;
 
 	/* MTU range: 68 - 9216 */
 	netdev->min_mtu = ETH_MIN_MTU;
@@ -4364,6 +4360,8 @@ void igb_configure_tx_ring(struct igb_adapter *adapter,
 	u64 tdba = ring->dma;
 	int reg_idx = ring->reg_idx;
 
+	WRITE_ONCE(ring->xsk_pool, igb_xsk_pool(adapter, ring));
+
 	wr32(E1000_TDLEN(reg_idx),
 	     ring->count * sizeof(union e1000_adv_tx_desc));
 	wr32(E1000_TDBAL(reg_idx),
@@ -4424,7 +4422,8 @@ int igb_setup_rx_resources(struct igb_ring *rx_ring)
 	if (xdp_rxq_info_is_reg(&rx_ring->xdp_rxq))
 		xdp_rxq_info_unreg(&rx_ring->xdp_rxq);
 	res = xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev,
-			       rx_ring->queue_index, 0);
+			       rx_ring->queue_index,
+			       rx_ring->q_vector->napi.napi_id);
 	if (res < 0) {
 		dev_err(dev, "Failed to register xdp_rxq index %u\n",
 			rx_ring->queue_index);
@@ -4720,12 +4719,17 @@ void igb_setup_srrctl(struct igb_adapter *adapter, struct igb_ring *ring)
 	struct e1000_hw *hw = &adapter->hw;
 	int reg_idx = ring->reg_idx;
 	u32 srrctl = 0;
+	u32 buf_size;
 
-	srrctl = IGB_RX_HDR_LEN << E1000_SRRCTL_BSIZEHDRSIZE_SHIFT;
-	if (ring_uses_large_buffer(ring))
-		srrctl |= IGB_RXBUFFER_3072 >> E1000_SRRCTL_BSIZEPKT_SHIFT;
+	if (ring->xsk_pool)
+		buf_size = xsk_pool_get_rx_frame_size(ring->xsk_pool);
+	else if (ring_uses_large_buffer(ring))
+		buf_size = IGB_RXBUFFER_3072;
 	else
-		srrctl |= IGB_RXBUFFER_2048 >> E1000_SRRCTL_BSIZEPKT_SHIFT;
+		buf_size = IGB_RXBUFFER_2048;
+
+	srrctl = IGB_RX_HDR_LEN << E1000_SRRCTL_BSIZEHDRSIZE_SHIFT;
+	srrctl |= buf_size >> E1000_SRRCTL_BSIZEPKT_SHIFT;
 	srrctl |= E1000_SRRCTL_DESCTYPE_ADV_ONEBUF;
 	if (hw->mac.type >= e1000_82580)
 		srrctl |= E1000_SRRCTL_TIMESTAMP;
@@ -4757,8 +4761,17 @@ void igb_configure_rx_ring(struct igb_adapter *adapter,
 	u32 rxdctl = 0;
 
 	xdp_rxq_info_unreg_mem_model(&ring->xdp_rxq);
-	WARN_ON(xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
-					   MEM_TYPE_PAGE_SHARED, NULL));
+	WRITE_ONCE(ring->xsk_pool, igb_xsk_pool(adapter, ring));
+	if (ring->xsk_pool) {
+		WARN_ON(xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
+						   MEM_TYPE_XSK_BUFF_POOL,
+						   NULL));
+		xsk_pool_set_rxq_info(ring->xsk_pool, &ring->xdp_rxq);
+	} else {
+		WARN_ON(xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
+						   MEM_TYPE_PAGE_SHARED,
+						   NULL));
+	}
 
 	/* disable the queue */
 	wr32(E1000_RXDCTL(reg_idx), 0);
@@ -4785,9 +4798,12 @@ void igb_configure_rx_ring(struct igb_adapter *adapter,
 	rxdctl |= IGB_RX_HTHRESH << 8;
 	rxdctl |= IGB_RX_WTHRESH << 16;
 
-	/* initialize rx_buffer_info */
-	memset(ring->rx_buffer_info, 0,
-	       sizeof(struct igb_rx_buffer) * ring->count);
+	if (ring->xsk_pool)
+		memset(ring->rx_buffer_info_zc, 0,
+		       sizeof(*ring->rx_buffer_info_zc) * ring->count);
+	else
+		memset(ring->rx_buffer_info, 0,
+		       sizeof(*ring->rx_buffer_info) * ring->count);
 
 	/* initialize Rx descriptor 0 */
 	rx_desc = IGB_RX_DESC(ring, 0);
@@ -4888,19 +4904,24 @@ static void igb_free_all_tx_resources(struct igb_adapter *adapter)
  *  igb_clean_tx_ring - Free Tx Buffers
  *  @tx_ring: ring to be cleaned
  **/
-static void igb_clean_tx_ring(struct igb_ring *tx_ring)
+void igb_clean_tx_ring(struct igb_ring *tx_ring)
 {
 	u16 i = tx_ring->next_to_clean;
 	struct igb_tx_buffer *tx_buffer = &tx_ring->tx_buffer_info[i];
+	u32 xsk_frames = 0;
 
 	while (i != tx_ring->next_to_use) {
 		union e1000_adv_tx_desc *eop_desc, *tx_desc;
 
 		/* Free all the Tx ring sk_buffs or xdp frames */
-		if (tx_buffer->type == IGB_TYPE_SKB)
+		if (tx_buffer->type == IGB_TYPE_SKB) {
 			dev_kfree_skb_any(tx_buffer->skb);
-		else
+		} else if (tx_buffer->type == IGB_TYPE_XDP) {
 			xdp_return_frame(tx_buffer->xdpf);
+		} else if (tx_buffer->type == IGB_TYPE_XSK) {
+			xsk_frames++;
+			goto skip_for_xsk;
+		}
 
 		/* unmap skb header data */
 		dma_unmap_single(tx_ring->dev,
@@ -4931,6 +4952,7 @@ static void igb_clean_tx_ring(struct igb_ring *tx_ring)
 					       DMA_TO_DEVICE);
 		}
 
+skip_for_xsk:
 		tx_buffer->next_to_watch = NULL;
 
 		/* move us one more past the eop_desc for start of next pkt */
@@ -4945,6 +4967,9 @@ static void igb_clean_tx_ring(struct igb_ring *tx_ring)
 	/* reset BQL for queue */
 	netdev_tx_reset_queue(txring_txq(tx_ring));
 
+	if (tx_ring->xsk_pool && xsk_frames)
+		xsk_tx_completed(tx_ring->xsk_pool, xsk_frames);
+
 	/* reset next_to_use and next_to_clean */
 	tx_ring->next_to_use = 0;
 	tx_ring->next_to_clean = 0;
@@ -4975,8 +5000,13 @@ void igb_free_rx_resources(struct igb_ring *rx_ring)
 
 	rx_ring->xdp_prog = NULL;
 	xdp_rxq_info_unreg(&rx_ring->xdp_rxq);
-	vfree(rx_ring->rx_buffer_info);
-	rx_ring->rx_buffer_info = NULL;
+	if (rx_ring->xsk_pool) {
+		vfree(rx_ring->rx_buffer_info_zc);
+		rx_ring->rx_buffer_info_zc = NULL;
+	} else {
+		vfree(rx_ring->rx_buffer_info);
+		rx_ring->rx_buffer_info = NULL;
+	}
 
 	/* if not set, then don't free */
 	if (!rx_ring->desc)
@@ -5007,13 +5037,18 @@ static void igb_free_all_rx_resources(struct igb_adapter *adapter)
  *  igb_clean_rx_ring - Free Rx Buffers per Queue
  *  @rx_ring: ring to free buffers from
  **/
-static void igb_clean_rx_ring(struct igb_ring *rx_ring)
+void igb_clean_rx_ring(struct igb_ring *rx_ring)
 {
 	u16 i = rx_ring->next_to_clean;
 
 	dev_kfree_skb(rx_ring->skb);
 	rx_ring->skb = NULL;
 
+	if (rx_ring->xsk_pool) {
+		igb_clean_rx_ring_zc(rx_ring);
+		goto skip_for_xsk;
+	}
+
 	/* Free all the Rx ring sk_buffs */
 	while (i != rx_ring->next_to_alloc) {
 		struct igb_rx_buffer *buffer_info = &rx_ring->rx_buffer_info[i];
@@ -5041,6 +5076,7 @@ static void igb_clean_rx_ring(struct igb_ring *rx_ring)
 			i = 0;
 	}
 
+skip_for_xsk:
 	rx_ring->next_to_alloc = 0;
 	rx_ring->next_to_clean = 0;
 	rx_ring->next_to_use = 0;
@@ -6467,6 +6503,9 @@ netdev_tx_t igb_xmit_frame_ring(struct sk_buff *skb,
 		return NETDEV_TX_BUSY;
 	}
 
+	if (unlikely(test_bit(IGB_RING_FLAG_TX_DISABLED, &tx_ring->flags)))
+		return NETDEV_TX_BUSY;
+
 	/* record the location of the first descriptor for this packet */
 	first = &tx_ring->tx_buffer_info[tx_ring->next_to_use];
 	first->type = IGB_TYPE_SKB;
@@ -6622,7 +6661,7 @@ static int igb_change_mtu(struct net_device *netdev, int new_mtu)
 	struct igb_adapter *adapter = netdev_priv(netdev);
 	int max_frame = new_mtu + IGB_ETH_PKT_HDR_PAD;
 
-	if (adapter->xdp_prog) {
+	if (igb_xdp_is_enabled(adapter)) {
 		int i;
 
 		for (i = 0; i < adapter->num_rx_queues; i++) {
@@ -8195,6 +8234,7 @@ static int igb_poll(struct napi_struct *napi, int budget)
 	struct igb_q_vector *q_vector = container_of(napi,
 						     struct igb_q_vector,
 						     napi);
+	struct xsk_buff_pool *xsk_pool;
 	bool clean_complete = true;
 	int work_done = 0;
 
@@ -8206,7 +8246,12 @@ static int igb_poll(struct napi_struct *napi, int budget)
 		clean_complete = igb_clean_tx_irq(q_vector, budget);
 
 	if (q_vector->rx.ring) {
-		int cleaned = igb_clean_rx_irq(q_vector, budget);
+		int cleaned;
+
+		xsk_pool = READ_ONCE(q_vector->rx.ring->xsk_pool);
+		cleaned = xsk_pool ?
+			igb_clean_rx_irq_zc(q_vector, xsk_pool, budget) :
+			igb_clean_rx_irq(q_vector, budget);
 
 		work_done += cleaned;
 		if (cleaned >= budget)
@@ -8235,13 +8280,18 @@ static int igb_poll(struct napi_struct *napi, int budget)
  **/
 static bool igb_clean_tx_irq(struct igb_q_vector *q_vector, int napi_budget)
 {
-	struct igb_adapter *adapter = q_vector->adapter;
-	struct igb_ring *tx_ring = q_vector->tx.ring;
-	struct igb_tx_buffer *tx_buffer;
-	union e1000_adv_tx_desc *tx_desc;
 	unsigned int total_bytes = 0, total_packets = 0;
+	struct igb_adapter *adapter = q_vector->adapter;
 	unsigned int budget = q_vector->tx.work_limit;
+	struct igb_ring *tx_ring = q_vector->tx.ring;
 	unsigned int i = tx_ring->next_to_clean;
+	union e1000_adv_tx_desc *tx_desc;
+	struct igb_tx_buffer *tx_buffer;
+	struct xsk_buff_pool *xsk_pool;
+	int cpu = smp_processor_id();
+	bool xsk_xmit_done = true;
+	struct netdev_queue *nq;
+	u32 xsk_frames = 0;
 
 	if (test_bit(__IGB_DOWN, &adapter->state))
 		return true;
@@ -8272,10 +8322,14 @@ static bool igb_clean_tx_irq(struct igb_q_vector *q_vector, int napi_budget)
 		total_packets += tx_buffer->gso_segs;
 
 		/* free the skb */
-		if (tx_buffer->type == IGB_TYPE_SKB)
+		if (tx_buffer->type == IGB_TYPE_SKB) {
 			napi_consume_skb(tx_buffer->skb, napi_budget);
-		else
+		} else if (tx_buffer->type == IGB_TYPE_XDP) {
 			xdp_return_frame(tx_buffer->xdpf);
+		} else if (tx_buffer->type == IGB_TYPE_XSK) {
+			xsk_frames++;
+			goto skip_for_xsk;
+		}
 
 		/* unmap skb header data */
 		dma_unmap_single(tx_ring->dev,
@@ -8307,6 +8361,7 @@ static bool igb_clean_tx_irq(struct igb_q_vector *q_vector, int napi_budget)
 			}
 		}
 
+skip_for_xsk:
 		/* move us one more past the eop_desc for start of next pkt */
 		tx_buffer++;
 		tx_desc++;
@@ -8335,6 +8390,21 @@ static bool igb_clean_tx_irq(struct igb_q_vector *q_vector, int napi_budget)
 	q_vector->tx.total_bytes += total_bytes;
 	q_vector->tx.total_packets += total_packets;
 
+	xsk_pool = READ_ONCE(tx_ring->xsk_pool);
+	if (xsk_pool) {
+		if (xsk_frames)
+			xsk_tx_completed(xsk_pool, xsk_frames);
+		if (xsk_uses_need_wakeup(xsk_pool))
+			xsk_set_tx_need_wakeup(xsk_pool);
+
+		nq = txring_txq(tx_ring);
+		__netif_tx_lock(nq, cpu);
+		/* Avoid transmit queue timeout since we share it with the slow path */
+		txq_trans_cond_update(nq);
+		xsk_xmit_done = igb_xmit_zc(tx_ring, xsk_pool);
+		__netif_tx_unlock(nq);
+	}
+
 	if (test_bit(IGB_RING_FLAG_TX_DETECT_HANG, &tx_ring->flags)) {
 		struct e1000_hw *hw = &adapter->hw;
 
@@ -8397,7 +8467,7 @@ static bool igb_clean_tx_irq(struct igb_q_vector *q_vector, int napi_budget)
 		}
 	}
 
-	return !!budget;
+	return !!budget && xsk_xmit_done;
 }
 
 /**
@@ -8588,9 +8658,8 @@ static struct sk_buff *igb_build_skb(struct igb_ring *rx_ring,
 	return skb;
 }
 
-static struct sk_buff *igb_run_xdp(struct igb_adapter *adapter,
-				   struct igb_ring *rx_ring,
-				   struct xdp_buff *xdp)
+static int igb_run_xdp(struct igb_adapter *adapter, struct igb_ring *rx_ring,
+		       struct xdp_buff *xdp)
 {
 	int err, result = IGB_XDP_PASS;
 	struct bpf_prog *xdp_prog;
@@ -8630,7 +8699,7 @@ out_failure:
 		break;
 	}
 xdp_out:
-	return ERR_PTR(-result);
+	return result;
 }
 
 static unsigned int igb_rx_frame_truesize(struct igb_ring *rx_ring,
@@ -8756,10 +8825,6 @@ static bool igb_cleanup_headers(struct igb_ring *rx_ring,
 				union e1000_adv_rx_desc *rx_desc,
 				struct sk_buff *skb)
 {
-	/* XDP packets use error pointer so abort at this point */
-	if (IS_ERR(skb))
-		return true;
-
 	if (unlikely((igb_test_staterr(rx_desc,
 				       E1000_RXDEXT_ERR_FRAME_ERR_MASK)))) {
 		struct net_device *netdev = rx_ring->netdev;
@@ -8786,9 +8851,9 @@ static bool igb_cleanup_headers(struct igb_ring *rx_ring,
  *  order to populate the hash, checksum, VLAN, timestamp, protocol, and
  *  other fields within the skb.
  **/
-static void igb_process_skb_fields(struct igb_ring *rx_ring,
-				   union e1000_adv_rx_desc *rx_desc,
-				   struct sk_buff *skb)
+void igb_process_skb_fields(struct igb_ring *rx_ring,
+			    union e1000_adv_rx_desc *rx_desc,
+			    struct sk_buff *skb)
 {
 	struct net_device *dev = rx_ring->netdev;
 
@@ -8870,6 +8935,38 @@ static void igb_put_rx_buffer(struct igb_ring *rx_ring,
 	rx_buffer->page = NULL;
 }
 
+void igb_finalize_xdp(struct igb_adapter *adapter, unsigned int status)
+{
+	int cpu = smp_processor_id();
+	struct netdev_queue *nq;
+
+	if (status & IGB_XDP_REDIR)
+		xdp_do_flush();
+
+	if (status & IGB_XDP_TX) {
+		struct igb_ring *tx_ring = igb_xdp_tx_queue_mapping(adapter);
+
+		nq = txring_txq(tx_ring);
+		__netif_tx_lock(nq, cpu);
+		igb_xdp_ring_update_tail(tx_ring);
+		__netif_tx_unlock(nq);
+	}
+}
+
+void igb_update_rx_stats(struct igb_q_vector *q_vector, unsigned int packets,
+			 unsigned int bytes)
+{
+	struct igb_ring *ring = q_vector->rx.ring;
+
+	u64_stats_update_begin(&ring->rx_syncp);
+	ring->rx_stats.packets += packets;
+	ring->rx_stats.bytes += bytes;
+	u64_stats_update_end(&ring->rx_syncp);
+
+	q_vector->rx.total_packets += packets;
+	q_vector->rx.total_bytes += bytes;
+}
+
 static int igb_clean_rx_irq(struct igb_q_vector *q_vector, const int budget)
 {
 	unsigned int total_bytes = 0, total_packets = 0;
@@ -8877,12 +8974,11 @@ static int igb_clean_rx_irq(struct igb_q_vector *q_vector, const int budget)
 	struct igb_ring *rx_ring = q_vector->rx.ring;
 	u16 cleaned_count = igb_desc_unused(rx_ring);
 	struct sk_buff *skb = rx_ring->skb;
-	int cpu = smp_processor_id();
 	unsigned int xdp_xmit = 0;
-	struct netdev_queue *nq;
 	struct xdp_buff xdp;
 	u32 frame_sz = 0;
 	int rx_buf_pgcnt;
+	int xdp_res = 0;
 
 	/* Frame size depend on rx_ring setup when PAGE_SIZE=4K */
 #if (PAGE_SIZE < 8192)
@@ -8940,12 +9036,10 @@ static int igb_clean_rx_irq(struct igb_q_vector *q_vector, const int budget)
 			/* At larger PAGE_SIZE, frame_sz depend on len size */
 			xdp.frame_sz = igb_rx_frame_truesize(rx_ring, size);
 #endif
-			skb = igb_run_xdp(adapter, rx_ring, &xdp);
+			xdp_res = igb_run_xdp(adapter, rx_ring, &xdp);
 		}
 
-		if (IS_ERR(skb)) {
-			unsigned int xdp_res = -PTR_ERR(skb);
-
+		if (xdp_res) {
 			if (xdp_res & (IGB_XDP_TX | IGB_XDP_REDIR)) {
 				xdp_xmit |= xdp_res;
 				igb_rx_buffer_flip(rx_ring, rx_buffer, size);
@@ -8964,7 +9058,7 @@ static int igb_clean_rx_irq(struct igb_q_vector *q_vector, const int budget)
 						&xdp, timestamp);
 
 		/* exit if we failed to retrieve a buffer */
-		if (!skb) {
+		if (!xdp_res && !skb) {
 			rx_ring->rx_stats.alloc_failed++;
 			rx_buffer->pagecnt_bias++;
 			break;
@@ -8978,7 +9072,7 @@ static int igb_clean_rx_irq(struct igb_q_vector *q_vector, const int budget)
 			continue;
 
 		/* verify the packet layout is correct */
-		if (igb_cleanup_headers(rx_ring, rx_desc, skb)) {
+		if (xdp_res || igb_cleanup_headers(rx_ring, rx_desc, skb)) {
 			skb = NULL;
 			continue;
 		}
@@ -9001,24 +9095,10 @@ static int igb_clean_rx_irq(struct igb_q_vector *q_vector, const int budget)
 	/* place incomplete frames back on ring for completion */
 	rx_ring->skb = skb;
 
-	if (xdp_xmit & IGB_XDP_REDIR)
-		xdp_do_flush();
-
-	if (xdp_xmit & IGB_XDP_TX) {
-		struct igb_ring *tx_ring = igb_xdp_tx_queue_mapping(adapter);
-
-		nq = txring_txq(tx_ring);
-		__netif_tx_lock(nq, cpu);
-		igb_xdp_ring_update_tail(tx_ring);
-		__netif_tx_unlock(nq);
-	}
+	if (xdp_xmit)
+		igb_finalize_xdp(adapter, xdp_xmit);
 
-	u64_stats_update_begin(&rx_ring->rx_syncp);
-	rx_ring->rx_stats.packets += total_packets;
-	rx_ring->rx_stats.bytes += total_bytes;
-	u64_stats_update_end(&rx_ring->rx_syncp);
-	q_vector->rx.total_packets += total_packets;
-	q_vector->rx.total_bytes += total_bytes;
+	igb_update_rx_stats(q_vector, total_packets, total_bytes);
 
 	if (cleaned_count)
 		igb_alloc_rx_buffers(rx_ring, cleaned_count);
diff --git a/drivers/net/ethernet/intel/igb/igb_xsk.c b/drivers/net/ethernet/intel/igb/igb_xsk.c
new file mode 100644
index 000000000000..157d43787fa0
--- /dev/null
+++ b/drivers/net/ethernet/intel/igb/igb_xsk.c
@@ -0,0 +1,562 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2018 Intel Corporation. */
+
+#include <linux/bpf_trace.h>
+#include <net/xdp_sock_drv.h>
+#include <net/xdp.h>
+
+#include "e1000_hw.h"
+#include "igb.h"
+
+static int igb_realloc_rx_buffer_info(struct igb_ring *ring, bool pool_present)
+{
+	int size = pool_present ?
+		sizeof(*ring->rx_buffer_info_zc) * ring->count :
+		sizeof(*ring->rx_buffer_info) * ring->count;
+	void *buff_info = vmalloc(size);
+
+	if (!buff_info)
+		return -ENOMEM;
+
+	if (pool_present) {
+		vfree(ring->rx_buffer_info);
+		ring->rx_buffer_info = NULL;
+		ring->rx_buffer_info_zc = buff_info;
+	} else {
+		vfree(ring->rx_buffer_info_zc);
+		ring->rx_buffer_info_zc = NULL;
+		ring->rx_buffer_info = buff_info;
+	}
+
+	return 0;
+}
+
+static void igb_txrx_ring_disable(struct igb_adapter *adapter, u16 qid)
+{
+	struct igb_ring *tx_ring = adapter->tx_ring[qid];
+	struct igb_ring *rx_ring = adapter->rx_ring[qid];
+	struct e1000_hw *hw = &adapter->hw;
+
+	set_bit(IGB_RING_FLAG_TX_DISABLED, &tx_ring->flags);
+
+	wr32(E1000_TXDCTL(tx_ring->reg_idx), 0);
+	wr32(E1000_RXDCTL(rx_ring->reg_idx), 0);
+
+	synchronize_net();
+
+	/* Rx/Tx share the same napi context. */
+	napi_disable(&rx_ring->q_vector->napi);
+
+	igb_clean_tx_ring(tx_ring);
+	igb_clean_rx_ring(rx_ring);
+
+	memset(&rx_ring->rx_stats, 0, sizeof(rx_ring->rx_stats));
+	memset(&tx_ring->tx_stats, 0, sizeof(tx_ring->tx_stats));
+}
+
+static void igb_txrx_ring_enable(struct igb_adapter *adapter, u16 qid)
+{
+	struct igb_ring *tx_ring = adapter->tx_ring[qid];
+	struct igb_ring *rx_ring = adapter->rx_ring[qid];
+
+	igb_configure_tx_ring(adapter, tx_ring);
+	igb_configure_rx_ring(adapter, rx_ring);
+
+	synchronize_net();
+
+	clear_bit(IGB_RING_FLAG_TX_DISABLED, &tx_ring->flags);
+
+	/* call igb_desc_unused which always leaves
+	 * at least 1 descriptor unused to make sure
+	 * next_to_use != next_to_clean
+	 */
+	if (rx_ring->xsk_pool)
+		igb_alloc_rx_buffers_zc(rx_ring, rx_ring->xsk_pool,
+					igb_desc_unused(rx_ring));
+	else
+		igb_alloc_rx_buffers(rx_ring, igb_desc_unused(rx_ring));
+
+	/* Rx/Tx share the same napi context. */
+	napi_enable(&rx_ring->q_vector->napi);
+}
+
+struct xsk_buff_pool *igb_xsk_pool(struct igb_adapter *adapter,
+				   struct igb_ring *ring)
+{
+	int qid = ring->queue_index;
+	struct xsk_buff_pool *pool;
+
+	pool = xsk_get_pool_from_qid(adapter->netdev, qid);
+
+	if (!igb_xdp_is_enabled(adapter))
+		return NULL;
+
+	return (pool && pool->dev) ? pool : NULL;
+}
+
+static int igb_xsk_pool_enable(struct igb_adapter *adapter,
+			       struct xsk_buff_pool *pool,
+			       u16 qid)
+{
+	struct net_device *netdev = adapter->netdev;
+	struct igb_ring *rx_ring;
+	bool if_running;
+	int err;
+
+	if (qid >= adapter->num_rx_queues)
+		return -EINVAL;
+
+	if (qid >= netdev->real_num_rx_queues ||
+	    qid >= netdev->real_num_tx_queues)
+		return -EINVAL;
+
+	err = xsk_pool_dma_map(pool, &adapter->pdev->dev, IGB_RX_DMA_ATTR);
+	if (err)
+		return err;
+
+	rx_ring = adapter->rx_ring[qid];
+	if_running = netif_running(adapter->netdev) && igb_xdp_is_enabled(adapter);
+	if (if_running)
+		igb_txrx_ring_disable(adapter, qid);
+
+	if (if_running) {
+		err = igb_realloc_rx_buffer_info(rx_ring, true);
+		if (!err) {
+			igb_txrx_ring_enable(adapter, qid);
+			/* Kick start the NAPI context so that receiving will start */
+			err = igb_xsk_wakeup(adapter->netdev, qid, XDP_WAKEUP_RX);
+		}
+
+		if (err) {
+			xsk_pool_dma_unmap(pool, IGB_RX_DMA_ATTR);
+			return err;
+		}
+	}
+
+	return 0;
+}
+
+static int igb_xsk_pool_disable(struct igb_adapter *adapter, u16 qid)
+{
+	struct xsk_buff_pool *pool;
+	struct igb_ring *rx_ring;
+	bool if_running;
+	int err;
+
+	pool = xsk_get_pool_from_qid(adapter->netdev, qid);
+	if (!pool)
+		return -EINVAL;
+
+	rx_ring = adapter->rx_ring[qid];
+	if_running = netif_running(adapter->netdev) && igb_xdp_is_enabled(adapter);
+	if (if_running)
+		igb_txrx_ring_disable(adapter, qid);
+
+	xsk_pool_dma_unmap(pool, IGB_RX_DMA_ATTR);
+
+	if (if_running) {
+		err = igb_realloc_rx_buffer_info(rx_ring, false);
+		if (err)
+			return err;
+
+		igb_txrx_ring_enable(adapter, qid);
+	}
+
+	return 0;
+}
+
+int igb_xsk_pool_setup(struct igb_adapter *adapter,
+		       struct xsk_buff_pool *pool,
+		       u16 qid)
+{
+	return pool ? igb_xsk_pool_enable(adapter, pool, qid) :
+		igb_xsk_pool_disable(adapter, qid);
+}
+
+static u16 igb_fill_rx_descs(struct xsk_buff_pool *pool, struct xdp_buff **xdp,
+			     union e1000_adv_rx_desc *rx_desc, u16 count)
+{
+	dma_addr_t dma;
+	u16 buffs;
+	int i;
+
+	/* nothing to do */
+	if (!count)
+		return 0;
+
+	buffs = xsk_buff_alloc_batch(pool, xdp, count);
+	for (i = 0; i < buffs; i++) {
+		dma = xsk_buff_xdp_get_dma(*xdp);
+		rx_desc->read.pkt_addr = cpu_to_le64(dma);
+		rx_desc->wb.upper.length = 0;
+
+		rx_desc++;
+		xdp++;
+	}
+
+	return buffs;
+}
+
+bool igb_alloc_rx_buffers_zc(struct igb_ring *rx_ring,
+			     struct xsk_buff_pool *xsk_pool, u16 count)
+{
+	u32 nb_buffs_extra = 0, nb_buffs = 0;
+	union e1000_adv_rx_desc *rx_desc;
+	u16 ntu = rx_ring->next_to_use;
+	u16 total_count = count;
+	struct xdp_buff **xdp;
+
+	rx_desc = IGB_RX_DESC(rx_ring, ntu);
+	xdp = &rx_ring->rx_buffer_info_zc[ntu];
+
+	if (ntu + count >= rx_ring->count) {
+		nb_buffs_extra = igb_fill_rx_descs(xsk_pool, xdp, rx_desc,
+						   rx_ring->count - ntu);
+		if (nb_buffs_extra != rx_ring->count - ntu) {
+			ntu += nb_buffs_extra;
+			goto exit;
+		}
+		rx_desc = IGB_RX_DESC(rx_ring, 0);
+		xdp = rx_ring->rx_buffer_info_zc;
+		ntu = 0;
+		count -= nb_buffs_extra;
+	}
+
+	nb_buffs = igb_fill_rx_descs(xsk_pool, xdp, rx_desc, count);
+	ntu += nb_buffs;
+	if (ntu == rx_ring->count)
+		ntu = 0;
+
+	/* clear the length for the next_to_use descriptor */
+	rx_desc = IGB_RX_DESC(rx_ring, ntu);
+	rx_desc->wb.upper.length = 0;
+
+exit:
+	if (rx_ring->next_to_use != ntu) {
+		rx_ring->next_to_use = ntu;
+
+		/* Force memory writes to complete before letting h/w
+		 * know there are new descriptors to fetch.  (Only
+		 * applicable for weak-ordered memory model archs,
+		 * such as IA-64).
+		 */
+		wmb();
+		writel(ntu, rx_ring->tail);
+	}
+
+	return total_count == (nb_buffs + nb_buffs_extra);
+}
+
+void igb_clean_rx_ring_zc(struct igb_ring *rx_ring)
+{
+	u16 ntc = rx_ring->next_to_clean;
+	u16 ntu = rx_ring->next_to_use;
+
+	while (ntc != ntu) {
+		struct xdp_buff *xdp = rx_ring->rx_buffer_info_zc[ntc];
+
+		xsk_buff_free(xdp);
+		ntc++;
+		if (ntc >= rx_ring->count)
+			ntc = 0;
+	}
+}
+
+static struct sk_buff *igb_construct_skb_zc(struct igb_ring *rx_ring,
+					    struct xdp_buff *xdp,
+					    ktime_t timestamp)
+{
+	unsigned int totalsize = xdp->data_end - xdp->data_meta;
+	unsigned int metasize = xdp->data - xdp->data_meta;
+	struct sk_buff *skb;
+
+	net_prefetch(xdp->data_meta);
+
+	/* allocate a skb to store the frags */
+	skb = napi_alloc_skb(&rx_ring->q_vector->napi, totalsize);
+	if (unlikely(!skb))
+		return NULL;
+
+	if (timestamp)
+		skb_hwtstamps(skb)->hwtstamp = timestamp;
+
+	memcpy(__skb_put(skb, totalsize), xdp->data_meta,
+	       ALIGN(totalsize, sizeof(long)));
+
+	if (metasize) {
+		skb_metadata_set(skb, metasize);
+		__skb_pull(skb, metasize);
+	}
+
+	return skb;
+}
+
+static int igb_run_xdp_zc(struct igb_adapter *adapter, struct igb_ring *rx_ring,
+			  struct xdp_buff *xdp, struct xsk_buff_pool *xsk_pool,
+			  struct bpf_prog *xdp_prog)
+{
+	int err, result = IGB_XDP_PASS;
+	u32 act;
+
+	prefetchw(xdp->data_hard_start); /* xdp_frame write */
+
+	act = bpf_prog_run_xdp(xdp_prog, xdp);
+
+	if (likely(act == XDP_REDIRECT)) {
+		err = xdp_do_redirect(adapter->netdev, xdp, xdp_prog);
+		if (!err)
+			return IGB_XDP_REDIR;
+
+		if (xsk_uses_need_wakeup(xsk_pool) &&
+		    err == -ENOBUFS)
+			result = IGB_XDP_EXIT;
+		else
+			result = IGB_XDP_CONSUMED;
+		goto out_failure;
+	}
+
+	switch (act) {
+	case XDP_PASS:
+		break;
+	case XDP_TX:
+		result = igb_xdp_xmit_back(adapter, xdp);
+		if (result == IGB_XDP_CONSUMED)
+			goto out_failure;
+		break;
+	default:
+		bpf_warn_invalid_xdp_action(adapter->netdev, xdp_prog, act);
+		fallthrough;
+	case XDP_ABORTED:
+out_failure:
+		trace_xdp_exception(rx_ring->netdev, xdp_prog, act);
+		fallthrough;
+	case XDP_DROP:
+		result = IGB_XDP_CONSUMED;
+		break;
+	}
+
+	return result;
+}
+
+int igb_clean_rx_irq_zc(struct igb_q_vector *q_vector,
+			struct xsk_buff_pool *xsk_pool, const int budget)
+{
+	struct igb_adapter *adapter = q_vector->adapter;
+	unsigned int total_bytes = 0, total_packets = 0;
+	struct igb_ring *rx_ring = q_vector->rx.ring;
+	u32 ntc = rx_ring->next_to_clean;
+	struct bpf_prog *xdp_prog;
+	unsigned int xdp_xmit = 0;
+	bool failure = false;
+	u16 entries_to_alloc;
+	struct sk_buff *skb;
+
+	/* xdp_prog cannot be NULL in the ZC path */
+	xdp_prog = READ_ONCE(rx_ring->xdp_prog);
+
+	while (likely(total_packets < budget)) {
+		union e1000_adv_rx_desc *rx_desc;
+		ktime_t timestamp = 0;
+		struct xdp_buff *xdp;
+		unsigned int size;
+		int xdp_res = 0;
+
+		rx_desc = IGB_RX_DESC(rx_ring, ntc);
+		size = le16_to_cpu(rx_desc->wb.upper.length);
+		if (!size)
+			break;
+
+		/* This memory barrier is needed to keep us from reading
+		 * any other fields out of the rx_desc until we know the
+		 * descriptor has been written back
+		 */
+		dma_rmb();
+
+		xdp = rx_ring->rx_buffer_info_zc[ntc];
+		xsk_buff_set_size(xdp, size);
+		xsk_buff_dma_sync_for_cpu(xdp);
+
+		/* pull rx packet timestamp if available and valid */
+		if (igb_test_staterr(rx_desc, E1000_RXDADV_STAT_TSIP)) {
+			int ts_hdr_len;
+
+			ts_hdr_len = igb_ptp_rx_pktstamp(rx_ring->q_vector,
+							 xdp->data,
+							 &timestamp);
+
+			xdp->data += ts_hdr_len;
+			xdp->data_meta += ts_hdr_len;
+			size -= ts_hdr_len;
+		}
+
+		xdp_res = igb_run_xdp_zc(adapter, rx_ring, xdp, xsk_pool,
+					 xdp_prog);
+
+		if (xdp_res) {
+			if (likely(xdp_res & (IGB_XDP_TX | IGB_XDP_REDIR))) {
+				xdp_xmit |= xdp_res;
+			} else if (xdp_res == IGB_XDP_EXIT) {
+				failure = true;
+				break;
+			} else if (xdp_res == IGB_XDP_CONSUMED) {
+				xsk_buff_free(xdp);
+			}
+
+			total_packets++;
+			total_bytes += size;
+			ntc++;
+			if (ntc == rx_ring->count)
+				ntc = 0;
+			continue;
+		}
+
+		skb = igb_construct_skb_zc(rx_ring, xdp, timestamp);
+
+		/* exit if we failed to retrieve a buffer */
+		if (!skb) {
+			rx_ring->rx_stats.alloc_failed++;
+			break;
+		}
+
+		xsk_buff_free(xdp);
+		ntc++;
+		if (ntc == rx_ring->count)
+			ntc = 0;
+
+		if (eth_skb_pad(skb))
+			continue;
+
+		/* probably a little skewed due to removing CRC */
+		total_bytes += skb->len;
+
+		/* populate checksum, timestamp, VLAN, and protocol */
+		igb_process_skb_fields(rx_ring, rx_desc, skb);
+
+		napi_gro_receive(&q_vector->napi, skb);
+
+		/* update budget accounting */
+		total_packets++;
+	}
+
+	rx_ring->next_to_clean = ntc;
+
+	if (xdp_xmit)
+		igb_finalize_xdp(adapter, xdp_xmit);
+
+	igb_update_rx_stats(q_vector, total_packets, total_bytes);
+
+	entries_to_alloc = igb_desc_unused(rx_ring);
+	if (entries_to_alloc >= IGB_RX_BUFFER_WRITE)
+		failure |= !igb_alloc_rx_buffers_zc(rx_ring, xsk_pool,
+						    entries_to_alloc);
+
+	if (xsk_uses_need_wakeup(xsk_pool)) {
+		if (failure || rx_ring->next_to_clean == rx_ring->next_to_use)
+			xsk_set_rx_need_wakeup(xsk_pool);
+		else
+			xsk_clear_rx_need_wakeup(xsk_pool);
+
+		return (int)total_packets;
+	}
+	return failure ? budget : (int)total_packets;
+}
+
+bool igb_xmit_zc(struct igb_ring *tx_ring, struct xsk_buff_pool *xsk_pool)
+{
+	unsigned int budget = igb_desc_unused(tx_ring);
+	u32 cmd_type, olinfo_status, nb_pkts, i = 0;
+	struct xdp_desc *descs = xsk_pool->tx_descs;
+	union e1000_adv_tx_desc *tx_desc = NULL;
+	struct igb_tx_buffer *tx_buffer_info;
+	unsigned int total_bytes = 0;
+	dma_addr_t dma;
+
+	if (!netif_carrier_ok(tx_ring->netdev))
+		return true;
+
+	if (test_bit(IGB_RING_FLAG_TX_DISABLED, &tx_ring->flags))
+		return true;
+
+	nb_pkts = xsk_tx_peek_release_desc_batch(xsk_pool, budget);
+	if (!nb_pkts)
+		return true;
+
+	while (nb_pkts-- > 0) {
+		dma = xsk_buff_raw_get_dma(xsk_pool, descs[i].addr);
+		xsk_buff_raw_dma_sync_for_device(xsk_pool, dma, descs[i].len);
+
+		tx_buffer_info = &tx_ring->tx_buffer_info[tx_ring->next_to_use];
+		tx_buffer_info->bytecount = descs[i].len;
+		tx_buffer_info->type = IGB_TYPE_XSK;
+		tx_buffer_info->xdpf = NULL;
+		tx_buffer_info->gso_segs = 1;
+		tx_buffer_info->time_stamp = jiffies;
+
+		tx_desc = IGB_TX_DESC(tx_ring, tx_ring->next_to_use);
+		tx_desc->read.buffer_addr = cpu_to_le64(dma);
+
+		/* put descriptor type bits */
+		cmd_type = E1000_ADVTXD_DTYP_DATA | E1000_ADVTXD_DCMD_DEXT |
+			   E1000_ADVTXD_DCMD_IFCS;
+		olinfo_status = descs[i].len << E1000_ADVTXD_PAYLEN_SHIFT;
+
+		/* FIXME: This sets the Report Status (RS) bit for every
+		 * descriptor. One nice to have optimization would be to set it
+		 * only for the last descriptor in the whole batch. See Intel
+		 * ice driver for an example on how to do it.
+		 */
+		cmd_type |= descs[i].len | IGB_TXD_DCMD;
+		tx_desc->read.cmd_type_len = cpu_to_le32(cmd_type);
+		tx_desc->read.olinfo_status = cpu_to_le32(olinfo_status);
+
+		total_bytes += descs[i].len;
+
+		i++;
+		tx_ring->next_to_use++;
+		tx_buffer_info->next_to_watch = tx_desc;
+		if (tx_ring->next_to_use == tx_ring->count)
+			tx_ring->next_to_use = 0;
+	}
+
+	netdev_tx_sent_queue(txring_txq(tx_ring), total_bytes);
+	igb_xdp_ring_update_tail(tx_ring);
+
+	return nb_pkts < budget;
+}
+
+int igb_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags)
+{
+	struct igb_adapter *adapter = netdev_priv(dev);
+	struct e1000_hw *hw = &adapter->hw;
+	struct igb_ring *ring;
+	u32 eics = 0;
+
+	if (test_bit(__IGB_DOWN, &adapter->state))
+		return -ENETDOWN;
+
+	if (!igb_xdp_is_enabled(adapter))
+		return -EINVAL;
+
+	if (qid >= adapter->num_tx_queues)
+		return -EINVAL;
+
+	ring = adapter->tx_ring[qid];
+
+	if (test_bit(IGB_RING_FLAG_TX_DISABLED, &ring->flags))
+		return -ENETDOWN;
+
+	if (!READ_ONCE(ring->xsk_pool))
+		return -EINVAL;
+
+	if (!napi_if_scheduled_mark_missed(&ring->q_vector->napi)) {
+		/* Cause software interrupt */
+		if (adapter->flags & IGB_FLAG_HAS_MSIX) {
+			eics |= ring->q_vector->eims_value;
+			wr32(E1000_EICS, eics);
+		} else {
+			wr32(E1000_ICS, E1000_ICS_RXDMT0);
+		}
+	}
+
+	return 0;
+}
diff --git a/drivers/net/ethernet/intel/igc/igc.h b/drivers/net/ethernet/intel/igc/igc.h
index eac0f966e0e4..b8111ad9a9a8 100644
--- a/drivers/net/ethernet/intel/igc/igc.h
+++ b/drivers/net/ethernet/intel/igc/igc.h
@@ -337,6 +337,8 @@ struct igc_adapter {
 	struct igc_led_classdev *leds;
 };
 
+void igc_set_queue_napi(struct igc_adapter *adapter, int q_idx,
+			struct napi_struct *napi);
 void igc_up(struct igc_adapter *adapter);
 void igc_down(struct igc_adapter *adapter);
 int igc_open(struct net_device *netdev);
diff --git a/drivers/net/ethernet/intel/igc/igc_base.c b/drivers/net/ethernet/intel/igc/igc_base.c
index 9fae8bdec2a7..1613b562d17c 100644
--- a/drivers/net/ethernet/intel/igc/igc_base.c
+++ b/drivers/net/ethernet/intel/igc/igc_base.c
@@ -68,6 +68,10 @@ static s32 igc_init_nvm_params_base(struct igc_hw *hw)
 	u32 eecd = rd32(IGC_EECD);
 	u16 size;
 
+	/* failed to read reg and got all F's */
+	if (!(~eecd))
+		return -ENXIO;
+
 	size = FIELD_GET(IGC_EECD_SIZE_EX_MASK, eecd);
 
 	/* Added to a constant, "size" becomes the left-shift value
@@ -221,6 +225,8 @@ static s32 igc_get_invariants_base(struct igc_hw *hw)
 
 	/* NVM initialization */
 	ret_val = igc_init_nvm_params_base(hw);
+	if (ret_val)
+		goto out;
 	switch (hw->mac.type) {
 	case igc_i225:
 		ret_val = igc_init_nvm_params_i225(hw);
diff --git a/drivers/net/ethernet/intel/igc/igc_hw.h b/drivers/net/ethernet/intel/igc/igc_hw.h
index d9d1a1a11daf..be8a49a86d09 100644
--- a/drivers/net/ethernet/intel/igc/igc_hw.h
+++ b/drivers/net/ethernet/intel/igc/igc_hw.h
@@ -279,9 +279,4 @@ struct net_device *igc_get_hw_dev(struct igc_hw *hw);
 #define hw_dbg(format, arg...) \
 	netdev_dbg(igc_get_hw_dev(hw), format, ##arg)
 
-s32  igc_read_pcie_cap_reg(struct igc_hw *hw, u32 reg, u16 *value);
-s32  igc_write_pcie_cap_reg(struct igc_hw *hw, u32 reg, u16 *value);
-void igc_read_pci_cfg(struct igc_hw *hw, u32 reg, u16 *value);
-void igc_write_pci_cfg(struct igc_hw *hw, u32 reg, u16 *value);
-
 #endif /* _IGC_HW_H_ */
diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c
index 27872bdea9bd..56a35d58e7a6 100644
--- a/drivers/net/ethernet/intel/igc/igc_main.c
+++ b/drivers/net/ethernet/intel/igc/igc_main.c
@@ -2123,10 +2123,6 @@ static bool igc_cleanup_headers(struct igc_ring *rx_ring,
 				union igc_adv_rx_desc *rx_desc,
 				struct sk_buff *skb)
 {
-	/* XDP packets use error pointer so abort at this point */
-	if (IS_ERR(skb))
-		return true;
-
 	if (unlikely(igc_test_staterr(rx_desc, IGC_RXDEXT_STATERR_RXE))) {
 		struct net_device *netdev = rx_ring->netdev;
 
@@ -2515,8 +2511,7 @@ out_failure:
 	}
 }
 
-static struct sk_buff *igc_xdp_run_prog(struct igc_adapter *adapter,
-					struct xdp_buff *xdp)
+static int igc_xdp_run_prog(struct igc_adapter *adapter, struct xdp_buff *xdp)
 {
 	struct bpf_prog *prog;
 	int res;
@@ -2530,7 +2525,7 @@ static struct sk_buff *igc_xdp_run_prog(struct igc_adapter *adapter,
 	res = __igc_xdp_run_prog(adapter, prog, xdp);
 
 out:
-	return ERR_PTR(-res);
+	return res;
 }
 
 /* This function assumes __netif_tx_lock is held by the caller. */
@@ -2585,6 +2580,7 @@ static int igc_clean_rx_irq(struct igc_q_vector *q_vector, const int budget)
 	struct sk_buff *skb = rx_ring->skb;
 	u16 cleaned_count = igc_desc_unused(rx_ring);
 	int xdp_status = 0, rx_buffer_pgcnt;
+	int xdp_res = 0;
 
 	while (likely(total_packets < budget)) {
 		struct igc_xdp_buff ctx = { .rx_ts = NULL };
@@ -2630,12 +2626,10 @@ static int igc_clean_rx_irq(struct igc_q_vector *q_vector, const int budget)
 			xdp_buff_clear_frags_flag(&ctx.xdp);
 			ctx.rx_desc = rx_desc;
 
-			skb = igc_xdp_run_prog(adapter, &ctx.xdp);
+			xdp_res = igc_xdp_run_prog(adapter, &ctx.xdp);
 		}
 
-		if (IS_ERR(skb)) {
-			unsigned int xdp_res = -PTR_ERR(skb);
-
+		if (xdp_res) {
 			switch (xdp_res) {
 			case IGC_XDP_CONSUMED:
 				rx_buffer->pagecnt_bias++;
@@ -2657,7 +2651,7 @@ static int igc_clean_rx_irq(struct igc_q_vector *q_vector, const int budget)
 			skb = igc_construct_skb(rx_ring, rx_buffer, &ctx);
 
 		/* exit if we failed to retrieve a buffer */
-		if (!skb) {
+		if (!xdp_res && !skb) {
 			rx_ring->rx_stats.alloc_failed++;
 			rx_buffer->pagecnt_bias++;
 			set_bit(IGC_RING_FLAG_RX_ALLOC_FAILED, &rx_ring->flags);
@@ -2672,7 +2666,7 @@ static int igc_clean_rx_irq(struct igc_q_vector *q_vector, const int budget)
 			continue;
 
 		/* verify the packet layout is correct */
-		if (igc_cleanup_headers(rx_ring, rx_desc, skb)) {
+		if (xdp_res || igc_cleanup_headers(rx_ring, rx_desc, skb)) {
 			skb = NULL;
 			continue;
 		}
@@ -4948,6 +4942,22 @@ static int igc_sw_init(struct igc_adapter *adapter)
 	return 0;
 }
 
+void igc_set_queue_napi(struct igc_adapter *adapter, int vector,
+			struct napi_struct *napi)
+{
+	struct igc_q_vector *q_vector = adapter->q_vector[vector];
+
+	if (q_vector->rx.ring)
+		netif_queue_set_napi(adapter->netdev,
+				     q_vector->rx.ring->queue_index,
+				     NETDEV_QUEUE_TYPE_RX, napi);
+
+	if (q_vector->tx.ring)
+		netif_queue_set_napi(adapter->netdev,
+				     q_vector->tx.ring->queue_index,
+				     NETDEV_QUEUE_TYPE_TX, napi);
+}
+
 /**
  * igc_up - Open the interface and prepare it to handle traffic
  * @adapter: board private structure
@@ -4955,6 +4965,7 @@ static int igc_sw_init(struct igc_adapter *adapter)
 void igc_up(struct igc_adapter *adapter)
 {
 	struct igc_hw *hw = &adapter->hw;
+	struct napi_struct *napi;
 	int i = 0;
 
 	/* hardware has been reset, we need to reload some things */
@@ -4962,8 +4973,11 @@ void igc_up(struct igc_adapter *adapter)
 
 	clear_bit(__IGC_DOWN, &adapter->state);
 
-	for (i = 0; i < adapter->num_q_vectors; i++)
-		napi_enable(&adapter->q_vector[i]->napi);
+	for (i = 0; i < adapter->num_q_vectors; i++) {
+		napi = &adapter->q_vector[i]->napi;
+		napi_enable(napi);
+		igc_set_queue_napi(adapter, i, napi);
+	}
 
 	if (adapter->msix_entries)
 		igc_configure_msix(adapter);
@@ -5192,6 +5206,7 @@ void igc_down(struct igc_adapter *adapter)
 	for (i = 0; i < adapter->num_q_vectors; i++) {
 		if (adapter->q_vector[i]) {
 			napi_synchronize(&adapter->q_vector[i]->napi);
+			igc_set_queue_napi(adapter, i, NULL);
 			napi_disable(&adapter->q_vector[i]->napi);
 		}
 	}
@@ -5576,6 +5591,9 @@ static int igc_request_msix(struct igc_adapter *adapter)
 				  q_vector);
 		if (err)
 			goto err_free;
+
+		netif_napi_set_irq(&q_vector->napi,
+				   adapter->msix_entries[vector].vector);
 	}
 
 	igc_configure_msix(adapter);
@@ -6018,6 +6036,7 @@ static int __igc_open(struct net_device *netdev, bool resuming)
 	struct igc_adapter *adapter = netdev_priv(netdev);
 	struct pci_dev *pdev = adapter->pdev;
 	struct igc_hw *hw = &adapter->hw;
+	struct napi_struct *napi;
 	int err = 0;
 	int i = 0;
 
@@ -6053,8 +6072,11 @@ static int __igc_open(struct net_device *netdev, bool resuming)
 
 	clear_bit(__IGC_DOWN, &adapter->state);
 
-	for (i = 0; i < adapter->num_q_vectors; i++)
-		napi_enable(&adapter->q_vector[i]->napi);
+	for (i = 0; i < adapter->num_q_vectors; i++) {
+		napi = &adapter->q_vector[i]->napi;
+		napi_enable(napi);
+		igc_set_queue_napi(adapter, i, napi);
+	}
 
 	/* Clear any pending interrupts. */
 	rd32(IGC_ICR);
@@ -6779,45 +6801,6 @@ static const struct net_device_ops igc_netdev_ops = {
 	.ndo_get_tstamp		= igc_get_tstamp,
 };
 
-/* PCIe configuration access */
-void igc_read_pci_cfg(struct igc_hw *hw, u32 reg, u16 *value)
-{
-	struct igc_adapter *adapter = hw->back;
-
-	pci_read_config_word(adapter->pdev, reg, value);
-}
-
-void igc_write_pci_cfg(struct igc_hw *hw, u32 reg, u16 *value)
-{
-	struct igc_adapter *adapter = hw->back;
-
-	pci_write_config_word(adapter->pdev, reg, *value);
-}
-
-s32 igc_read_pcie_cap_reg(struct igc_hw *hw, u32 reg, u16 *value)
-{
-	struct igc_adapter *adapter = hw->back;
-
-	if (!pci_is_pcie(adapter->pdev))
-		return -IGC_ERR_CONFIG;
-
-	pcie_capability_read_word(adapter->pdev, reg, value);
-
-	return IGC_SUCCESS;
-}
-
-s32 igc_write_pcie_cap_reg(struct igc_hw *hw, u32 reg, u16 *value)
-{
-	struct igc_adapter *adapter = hw->back;
-
-	if (!pci_is_pcie(adapter->pdev))
-		return -IGC_ERR_CONFIG;
-
-	pcie_capability_write_word(adapter->pdev, reg, *value);
-
-	return IGC_SUCCESS;
-}
-
 u32 igc_rd32(struct igc_hw *hw, u32 reg)
 {
 	struct igc_adapter *igc = container_of(hw, struct igc_adapter, hw);
@@ -7338,7 +7321,7 @@ static void igc_deliver_wake_packet(struct net_device *netdev)
 	netif_rx(skb);
 }
 
-static int igc_resume(struct device *dev)
+static int __igc_resume(struct device *dev, bool rpm)
 {
 	struct pci_dev *pdev = to_pci_dev(dev);
 	struct net_device *netdev = pci_get_drvdata(pdev);
@@ -7381,7 +7364,11 @@ static int igc_resume(struct device *dev)
 	wr32(IGC_WUS, ~0);
 
 	if (netif_running(netdev)) {
+		if (!rpm)
+			rtnl_lock();
 		err = __igc_open(netdev, true);
+		if (!rpm)
+			rtnl_unlock();
 		if (!err)
 			netif_device_attach(netdev);
 	}
@@ -7389,9 +7376,14 @@ static int igc_resume(struct device *dev)
 	return err;
 }
 
+static int igc_resume(struct device *dev)
+{
+	return __igc_resume(dev, false);
+}
+
 static int igc_runtime_resume(struct device *dev)
 {
-	return igc_resume(dev);
+	return __igc_resume(dev, true);
 }
 
 static int igc_suspend(struct device *dev)
@@ -7436,14 +7428,18 @@ static pci_ers_result_t igc_io_error_detected(struct pci_dev *pdev,
 	struct net_device *netdev = pci_get_drvdata(pdev);
 	struct igc_adapter *adapter = netdev_priv(netdev);
 
+	rtnl_lock();
 	netif_device_detach(netdev);
 
-	if (state == pci_channel_io_perm_failure)
+	if (state == pci_channel_io_perm_failure) {
+		rtnl_unlock();
 		return PCI_ERS_RESULT_DISCONNECT;
+	}
 
 	if (netif_running(netdev))
 		igc_down(adapter);
 	pci_disable_device(pdev);
+	rtnl_unlock();
 
 	/* Request a slot reset. */
 	return PCI_ERS_RESULT_NEED_RESET;
@@ -7454,7 +7450,7 @@ static pci_ers_result_t igc_io_error_detected(struct pci_dev *pdev,
  *  @pdev: Pointer to PCI device
  *
  *  Restart the card from scratch, as if from a cold-boot. Implementation
- *  resembles the first-half of the igc_resume routine.
+ *  resembles the first-half of the __igc_resume routine.
  **/
 static pci_ers_result_t igc_io_slot_reset(struct pci_dev *pdev)
 {
@@ -7493,7 +7489,7 @@ static pci_ers_result_t igc_io_slot_reset(struct pci_dev *pdev)
  *
  *  This callback is called when the error recovery driver tells us that
  *  its OK to resume normal operation. Implementation resembles the
- *  second-half of the igc_resume routine.
+ *  second-half of the __igc_resume routine.
  */
 static void igc_io_resume(struct pci_dev *pdev)
 {
diff --git a/drivers/net/ethernet/intel/igc/igc_nvm.c b/drivers/net/ethernet/intel/igc/igc_nvm.c
index 58f81aba0144..efd121c03967 100644
--- a/drivers/net/ethernet/intel/igc/igc_nvm.c
+++ b/drivers/net/ethernet/intel/igc/igc_nvm.c
@@ -36,56 +36,6 @@ static s32 igc_poll_eerd_eewr_done(struct igc_hw *hw, int ee_reg)
 }
 
 /**
- * igc_acquire_nvm - Generic request for access to EEPROM
- * @hw: pointer to the HW structure
- *
- * Set the EEPROM access request bit and wait for EEPROM access grant bit.
- * Return successful if access grant bit set, else clear the request for
- * EEPROM access and return -IGC_ERR_NVM (-1).
- */
-s32 igc_acquire_nvm(struct igc_hw *hw)
-{
-	s32 timeout = IGC_NVM_GRANT_ATTEMPTS;
-	u32 eecd = rd32(IGC_EECD);
-	s32 ret_val = 0;
-
-	wr32(IGC_EECD, eecd | IGC_EECD_REQ);
-	eecd = rd32(IGC_EECD);
-
-	while (timeout) {
-		if (eecd & IGC_EECD_GNT)
-			break;
-		udelay(5);
-		eecd = rd32(IGC_EECD);
-		timeout--;
-	}
-
-	if (!timeout) {
-		eecd &= ~IGC_EECD_REQ;
-		wr32(IGC_EECD, eecd);
-		hw_dbg("Could not acquire NVM grant\n");
-		ret_val = -IGC_ERR_NVM;
-	}
-
-	return ret_val;
-}
-
-/**
- * igc_release_nvm - Release exclusive access to EEPROM
- * @hw: pointer to the HW structure
- *
- * Stop any current commands to the EEPROM and clear the EEPROM request bit.
- */
-void igc_release_nvm(struct igc_hw *hw)
-{
-	u32 eecd;
-
-	eecd = rd32(IGC_EECD);
-	eecd &= ~IGC_EECD_REQ;
-	wr32(IGC_EECD, eecd);
-}
-
-/**
  * igc_read_nvm_eerd - Reads EEPROM using EERD register
  * @hw: pointer to the HW structure
  * @offset: offset of word in the EEPROM to read
diff --git a/drivers/net/ethernet/intel/igc/igc_nvm.h b/drivers/net/ethernet/intel/igc/igc_nvm.h
index f9fc2e9cfb03..ab78d0c64547 100644
--- a/drivers/net/ethernet/intel/igc/igc_nvm.h
+++ b/drivers/net/ethernet/intel/igc/igc_nvm.h
@@ -4,8 +4,6 @@
 #ifndef _IGC_NVM_H_
 #define _IGC_NVM_H_
 
-s32 igc_acquire_nvm(struct igc_hw *hw);
-void igc_release_nvm(struct igc_hw *hw);
 s32 igc_read_mac_addr(struct igc_hw *hw);
 s32 igc_read_nvm_eerd(struct igc_hw *hw, u16 offset, u16 words, u16 *data);
 s32 igc_validate_nvm_checksum(struct igc_hw *hw);
diff --git a/drivers/net/ethernet/intel/igc/igc_xdp.c b/drivers/net/ethernet/intel/igc/igc_xdp.c
index e27af72aada8..13bbd3346e01 100644
--- a/drivers/net/ethernet/intel/igc/igc_xdp.c
+++ b/drivers/net/ethernet/intel/igc/igc_xdp.c
@@ -13,6 +13,7 @@ int igc_xdp_set_prog(struct igc_adapter *adapter, struct bpf_prog *prog,
 	struct net_device *dev = adapter->netdev;
 	bool if_running = netif_running(dev);
 	struct bpf_prog *old_prog;
+	bool need_update;
 
 	if (dev->mtu > ETH_DATA_LEN) {
 		/* For now, the driver doesn't support XDP functionality with
@@ -22,7 +23,8 @@ int igc_xdp_set_prog(struct igc_adapter *adapter, struct bpf_prog *prog,
 		return -EOPNOTSUPP;
 	}
 
-	if (if_running)
+	need_update = !!adapter->xdp_prog != !!prog;
+	if (if_running && need_update)
 		igc_close(dev);
 
 	old_prog = xchg(&adapter->xdp_prog, prog);
@@ -34,7 +36,7 @@ int igc_xdp_set_prog(struct igc_adapter *adapter, struct bpf_prog *prog,
 	else
 		xdp_features_clear_redirect_target(dev);
 
-	if (if_running)
+	if (if_running && need_update)
 		igc_open(dev);
 
 	return 0;
@@ -84,6 +86,7 @@ static int igc_xdp_enable_pool(struct igc_adapter *adapter,
 		napi_disable(napi);
 	}
 
+	igc_set_queue_napi(adapter, queue_id, NULL);
 	set_bit(IGC_RING_FLAG_AF_XDP_ZC, &rx_ring->flags);
 	set_bit(IGC_RING_FLAG_AF_XDP_ZC, &tx_ring->flags);
 
@@ -133,6 +136,7 @@ static int igc_xdp_disable_pool(struct igc_adapter *adapter, u16 queue_id)
 	xsk_pool_dma_unmap(pool, IGC_RX_DMA_ATTR);
 	clear_bit(IGC_RING_FLAG_AF_XDP_ZC, &rx_ring->flags);
 	clear_bit(IGC_RING_FLAG_AF_XDP_ZC, &tx_ring->flags);
+	igc_set_queue_napi(adapter, queue_id, napi);
 
 	if (needs_reset) {
 		napi_enable(napi);
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 336e08d35f97..7236f20c9a30 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -1923,10 +1923,6 @@ bool ixgbe_cleanup_headers(struct ixgbe_ring *rx_ring,
 {
 	struct net_device *netdev = rx_ring->netdev;
 
-	/* XDP packets use error pointer so abort at this point */
-	if (IS_ERR(skb))
-		return true;
-
 	/* Verify netdev is present, and that packet does not have any
 	 * errors that would be unacceptable to the netdev.
 	 */
@@ -2234,9 +2230,9 @@ static struct sk_buff *ixgbe_build_skb(struct ixgbe_ring *rx_ring,
 	return skb;
 }
 
-static struct sk_buff *ixgbe_run_xdp(struct ixgbe_adapter *adapter,
-				     struct ixgbe_ring *rx_ring,
-				     struct xdp_buff *xdp)
+static int ixgbe_run_xdp(struct ixgbe_adapter *adapter,
+			 struct ixgbe_ring *rx_ring,
+			 struct xdp_buff *xdp)
 {
 	int err, result = IXGBE_XDP_PASS;
 	struct bpf_prog *xdp_prog;
@@ -2286,7 +2282,7 @@ out_failure:
 		break;
 	}
 xdp_out:
-	return ERR_PTR(-result);
+	return result;
 }
 
 static unsigned int ixgbe_rx_frame_truesize(struct ixgbe_ring *rx_ring,
@@ -2344,6 +2340,7 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
 	unsigned int offset = rx_ring->rx_offset;
 	unsigned int xdp_xmit = 0;
 	struct xdp_buff xdp;
+	int xdp_res = 0;
 
 	/* Frame size depend on rx_ring setup when PAGE_SIZE=4K */
 #if (PAGE_SIZE < 8192)
@@ -2389,12 +2386,10 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
 			/* At larger PAGE_SIZE, frame_sz depend on len size */
 			xdp.frame_sz = ixgbe_rx_frame_truesize(rx_ring, size);
 #endif
-			skb = ixgbe_run_xdp(adapter, rx_ring, &xdp);
+			xdp_res = ixgbe_run_xdp(adapter, rx_ring, &xdp);
 		}
 
-		if (IS_ERR(skb)) {
-			unsigned int xdp_res = -PTR_ERR(skb);
-
+		if (xdp_res) {
 			if (xdp_res & (IXGBE_XDP_TX | IXGBE_XDP_REDIR)) {
 				xdp_xmit |= xdp_res;
 				ixgbe_rx_buffer_flip(rx_ring, rx_buffer, size);
@@ -2414,7 +2409,7 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
 		}
 
 		/* exit if we failed to retrieve a buffer */
-		if (!skb) {
+		if (!xdp_res && !skb) {
 			rx_ring->rx_stats.alloc_rx_buff_failed++;
 			rx_buffer->pagecnt_bias++;
 			break;
@@ -2428,7 +2423,7 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
 			continue;
 
 		/* verify the packet layout is correct */
-		if (ixgbe_cleanup_headers(rx_ring, rx_desc, skb))
+		if (xdp_res || ixgbe_cleanup_headers(rx_ring, rx_desc, skb))
 			continue;
 
 		/* probably a little skewed due to removing CRC */
diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h
index 9b37f354d78c..4384e892f967 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h
@@ -443,7 +443,6 @@ extern const struct ixgbevf_info ixgbevf_X540_vf_hv_info;
 extern const struct ixgbevf_info ixgbevf_X550_vf_hv_info;
 extern const struct ixgbevf_info ixgbevf_X550EM_x_vf_hv_info;
 extern const struct ixgbevf_info ixgbevf_e610_vf_hv_info;
-extern const struct ixgbe_mbx_operations ixgbevf_hv_mbx_ops;
 
 /* needed by ethtool.c */
 extern const char ixgbevf_driver_name[];
diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
index 2829bac9af94..6442f115a262 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
@@ -737,10 +737,6 @@ static bool ixgbevf_cleanup_headers(struct ixgbevf_ring *rx_ring,
 				    union ixgbe_adv_rx_desc *rx_desc,
 				    struct sk_buff *skb)
 {
-	/* XDP packets use error pointer so abort at this point */
-	if (IS_ERR(skb))
-		return true;
-
 	/* verify that the packet does not have any known errors */
 	if (unlikely(ixgbevf_test_staterr(rx_desc,
 					  IXGBE_RXDADV_ERR_FRAME_ERR_MASK))) {
@@ -1049,9 +1045,9 @@ static int ixgbevf_xmit_xdp_ring(struct ixgbevf_ring *ring,
 	return IXGBEVF_XDP_TX;
 }
 
-static struct sk_buff *ixgbevf_run_xdp(struct ixgbevf_adapter *adapter,
-				       struct ixgbevf_ring  *rx_ring,
-				       struct xdp_buff *xdp)
+static int ixgbevf_run_xdp(struct ixgbevf_adapter *adapter,
+			   struct ixgbevf_ring *rx_ring,
+			   struct xdp_buff *xdp)
 {
 	int result = IXGBEVF_XDP_PASS;
 	struct ixgbevf_ring *xdp_ring;
@@ -1085,7 +1081,7 @@ out_failure:
 		break;
 	}
 xdp_out:
-	return ERR_PTR(-result);
+	return result;
 }
 
 static unsigned int ixgbevf_rx_frame_truesize(struct ixgbevf_ring *rx_ring,
@@ -1127,6 +1123,7 @@ static int ixgbevf_clean_rx_irq(struct ixgbevf_q_vector *q_vector,
 	struct sk_buff *skb = rx_ring->skb;
 	bool xdp_xmit = false;
 	struct xdp_buff xdp;
+	int xdp_res = 0;
 
 	/* Frame size depend on rx_ring setup when PAGE_SIZE=4K */
 #if (PAGE_SIZE < 8192)
@@ -1170,11 +1167,11 @@ static int ixgbevf_clean_rx_irq(struct ixgbevf_q_vector *q_vector,
 			/* At larger PAGE_SIZE, frame_sz depend on len size */
 			xdp.frame_sz = ixgbevf_rx_frame_truesize(rx_ring, size);
 #endif
-			skb = ixgbevf_run_xdp(adapter, rx_ring, &xdp);
+			xdp_res = ixgbevf_run_xdp(adapter, rx_ring, &xdp);
 		}
 
-		if (IS_ERR(skb)) {
-			if (PTR_ERR(skb) == -IXGBEVF_XDP_TX) {
+		if (xdp_res) {
+			if (xdp_res == IXGBEVF_XDP_TX) {
 				xdp_xmit = true;
 				ixgbevf_rx_buffer_flip(rx_ring, rx_buffer,
 						       size);
@@ -1194,7 +1191,7 @@ static int ixgbevf_clean_rx_irq(struct ixgbevf_q_vector *q_vector,
 		}
 
 		/* exit if we failed to retrieve a buffer */
-		if (!skb) {
+		if (!xdp_res && !skb) {
 			rx_ring->rx_stats.alloc_rx_buff_failed++;
 			rx_buffer->pagecnt_bias++;
 			break;
@@ -1208,7 +1205,7 @@ static int ixgbevf_clean_rx_irq(struct ixgbevf_q_vector *q_vector,
 			continue;
 
 		/* verify the packet layout is correct */
-		if (ixgbevf_cleanup_headers(rx_ring, rx_desc, skb)) {
+		if (xdp_res || ixgbevf_cleanup_headers(rx_ring, rx_desc, skb)) {
 			skb = NULL;
 			continue;
 		}
diff --git a/drivers/net/ethernet/intel/ixgbevf/mbx.c b/drivers/net/ethernet/intel/ixgbevf/mbx.c
index a55dd978f7ca..24d0237e7a99 100644
--- a/drivers/net/ethernet/intel/ixgbevf/mbx.c
+++ b/drivers/net/ethernet/intel/ixgbevf/mbx.c
@@ -505,15 +505,3 @@ const struct ixgbe_mbx_operations ixgbevf_mbx_ops_legacy = {
 	.check_for_ack	= ixgbevf_check_for_ack_vf,
 	.check_for_rst	= ixgbevf_check_for_rst_vf,
 };
-
-/* Mailbox operations when running on Hyper-V.
- * On Hyper-V, PF/VF communication is not through the
- * hardware mailbox; this communication is through
- * a software mediated path.
- * Most mail box operations are noop while running on
- * Hyper-V.
- */
-const struct ixgbe_mbx_operations ixgbevf_hv_mbx_ops = {
-	.init_params	= ixgbevf_init_mbx_params_vf,
-	.check_for_rst	= ixgbevf_check_for_rst_vf,
-};
diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_macsec.c b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_macsec.c
index 6cc7a78968fc..f3b9daffaec3 100644
--- a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_macsec.c
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_macsec.c
@@ -133,9 +133,7 @@ static const char *rsrc_name(enum mcs_rsrc_type rsrc_type)
 		return "SA";
 	default:
 		return "Unknown";
-	};
-
-	return "Unknown";
+	}
 }
 
 static int cn10k_mcs_alloc_rsrc(struct otx2_nic *pfvf, enum mcs_direction dir,
diff --git a/drivers/net/ethernet/mediatek/airoha_eth.c b/drivers/net/ethernet/mediatek/airoha_eth.c
index d8bfc21a5b19..a30c417d66f2 100644
--- a/drivers/net/ethernet/mediatek/airoha_eth.c
+++ b/drivers/net/ethernet/mediatek/airoha_eth.c
@@ -15,6 +15,7 @@
 #include <linux/u64_stats_sync.h>
 #include <net/dsa.h>
 #include <net/page_pool/helpers.h>
+#include <net/pkt_cls.h>
 #include <uapi/linux/ppp_defs.h>
 
 #define AIROHA_MAX_NUM_GDM_PORTS	1
@@ -23,8 +24,12 @@
 #define AIROHA_MAX_NUM_XSI_RSTS		5
 #define AIROHA_MAX_MTU			2000
 #define AIROHA_MAX_PACKET_SIZE		2048
+#define AIROHA_NUM_QOS_CHANNELS		4
+#define AIROHA_NUM_QOS_QUEUES		8
 #define AIROHA_NUM_TX_RING		32
 #define AIROHA_NUM_RX_RING		32
+#define AIROHA_NUM_NETDEV_TX_RINGS	(AIROHA_NUM_TX_RING + \
+					 AIROHA_NUM_QOS_CHANNELS)
 #define AIROHA_FE_MC_MAX_VLAN_TABLE	64
 #define AIROHA_FE_MC_MAX_VLAN_PORT	16
 #define AIROHA_NUM_TX_IRQ		2
@@ -40,6 +45,9 @@
 #define PSE_RSV_PAGES			128
 #define PSE_QUEUE_RSV_PAGES		64
 
+#define QDMA_METER_IDX(_n)		((_n) & 0xff)
+#define QDMA_METER_GROUP(_n)		(((_n) >> 8) & 0x3)
+
 /* FE */
 #define PSE_BASE			0x0100
 #define CSR_IFC_BASE			0x0200
@@ -541,9 +549,24 @@
 #define INGRESS_SLOW_TICK_RATIO_MASK	GENMASK(29, 16)
 #define INGRESS_FAST_TICK_MASK		GENMASK(15, 0)
 
+#define REG_QUEUE_CLOSE_CFG(_n)		(0x00a0 + ((_n) & 0xfc))
+#define TXQ_DISABLE_CHAN_QUEUE_MASK(_n, _m)	BIT((_m) + (((_n) & 0x3) << 3))
+
 #define REG_TXQ_DIS_CFG_BASE(_n)	((_n) ? 0x20a0 : 0x00a0)
 #define REG_TXQ_DIS_CFG(_n, _m)		(REG_TXQ_DIS_CFG_BASE((_n)) + (_m) << 2)
 
+#define REG_CNTR_CFG(_n)		(0x0400 + ((_n) << 3))
+#define CNTR_EN_MASK			BIT(31)
+#define CNTR_ALL_CHAN_EN_MASK		BIT(30)
+#define CNTR_ALL_QUEUE_EN_MASK		BIT(29)
+#define CNTR_ALL_DSCP_RING_EN_MASK	BIT(28)
+#define CNTR_SRC_MASK			GENMASK(27, 24)
+#define CNTR_DSCP_RING_MASK		GENMASK(20, 16)
+#define CNTR_CHAN_MASK			GENMASK(7, 3)
+#define CNTR_QUEUE_MASK			GENMASK(2, 0)
+
+#define REG_CNTR_VAL(_n)		(0x0404 + ((_n) << 3))
+
 #define REG_LMGR_INIT_CFG		0x1000
 #define LMGR_INIT_START			BIT(31)
 #define LMGR_SRAM_MODE_MASK		BIT(30)
@@ -565,13 +588,34 @@
 #define EGRESS_SLOW_TICK_RATIO_MASK	GENMASK(29, 16)
 #define EGRESS_FAST_TICK_MASK		GENMASK(15, 0)
 
+#define TRTCM_PARAM_RW_MASK		BIT(31)
+#define TRTCM_PARAM_RW_DONE_MASK	BIT(30)
+#define TRTCM_PARAM_TYPE_MASK		GENMASK(29, 28)
+#define TRTCM_METER_GROUP_MASK		GENMASK(27, 26)
+#define TRTCM_PARAM_INDEX_MASK		GENMASK(23, 17)
+#define TRTCM_PARAM_RATE_TYPE_MASK	BIT(16)
+
+#define REG_TRTCM_CFG_PARAM(_n)		((_n) + 0x4)
+#define REG_TRTCM_DATA_LOW(_n)		((_n) + 0x8)
+#define REG_TRTCM_DATA_HIGH(_n)		((_n) + 0xc)
+
 #define REG_TXWRR_MODE_CFG		0x1020
 #define TWRR_WEIGHT_SCALE_MASK		BIT(31)
 #define TWRR_WEIGHT_BASE_MASK		BIT(3)
 
+#define REG_TXWRR_WEIGHT_CFG		0x1024
+#define TWRR_RW_CMD_MASK		BIT(31)
+#define TWRR_RW_CMD_DONE		BIT(30)
+#define TWRR_CHAN_IDX_MASK		GENMASK(23, 19)
+#define TWRR_QUEUE_IDX_MASK		GENMASK(18, 16)
+#define TWRR_VALUE_MASK			GENMASK(15, 0)
+
 #define REG_PSE_BUF_USAGE_CFG		0x1028
 #define PSE_BUF_ESTIMATE_EN_MASK	BIT(29)
 
+#define REG_CHAN_QOS_MODE(_n)		(0x1040 + ((_n) << 2))
+#define CHAN_QOS_MODE_MASK(_n)		GENMASK(2 + ((_n) << 2), (_n) << 2)
+
 #define REG_GLB_TRTCM_CFG		0x1080
 #define GLB_TRTCM_EN_MASK		BIT(31)
 #define GLB_TRTCM_MODE_MASK		BIT(30)
@@ -720,6 +764,40 @@ enum {
 	FE_PSE_PORT_DROP = 0xf,
 };
 
+enum tx_sched_mode {
+	TC_SCH_WRR8,
+	TC_SCH_SP,
+	TC_SCH_WRR7,
+	TC_SCH_WRR6,
+	TC_SCH_WRR5,
+	TC_SCH_WRR4,
+	TC_SCH_WRR3,
+	TC_SCH_WRR2,
+};
+
+enum trtcm_param_type {
+	TRTCM_MISC_MODE, /* meter_en, pps_mode, tick_sel */
+	TRTCM_TOKEN_RATE_MODE,
+	TRTCM_BUCKETSIZE_SHIFT_MODE,
+	TRTCM_BUCKET_COUNTER_MODE,
+};
+
+enum trtcm_mode_type {
+	TRTCM_COMMIT_MODE,
+	TRTCM_PEAK_MODE,
+};
+
+enum trtcm_param {
+	TRTCM_TICK_SEL = BIT(0),
+	TRTCM_PKT_MODE = BIT(1),
+	TRTCM_METER_MODE = BIT(2),
+};
+
+#define MIN_TOKEN_SIZE				4096
+#define MAX_TOKEN_SIZE_OFFSET			17
+#define TRTCM_TOKEN_RATE_MASK			GENMASK(23, 6)
+#define TRTCM_TOKEN_RATE_FRACTION_MASK		GENMASK(5, 0)
+
 struct airoha_queue_entry {
 	union {
 		void *buf;
@@ -810,6 +888,12 @@ struct airoha_gdm_port {
 	int id;
 
 	struct airoha_hw_stats stats;
+
+	DECLARE_BITMAP(qos_sq_bmap, AIROHA_NUM_QOS_CHANNELS);
+
+	/* qos stats counters */
+	u64 cpu_tx_packets;
+	u64 fwd_tx_packets;
 };
 
 struct airoha_eth {
@@ -1789,6 +1873,10 @@ static int airoha_qdma_init_tx_queue(struct airoha_queue *q,
 		WRITE_ONCE(q->desc[i].ctrl, cpu_to_le32(val));
 	}
 
+	/* xmit ring drop default setting */
+	airoha_qdma_set(qdma, REG_TX_RING_BLOCKING(qid),
+			TX_RING_IRQ_BLOCKING_TX_DROP_EN_MASK);
+
 	airoha_qdma_wr(qdma, REG_TX_RING_BASE(qid), dma_addr);
 	airoha_qdma_rmw(qdma, REG_TX_CPU_IDX(qid), TX_RING_CPU_IDX_MASK,
 			FIELD_PREP(TX_RING_CPU_IDX_MASK, q->head));
@@ -1955,6 +2043,27 @@ static void airoha_qdma_init_qos(struct airoha_qdma *qdma)
 			FIELD_PREP(SLA_SLOW_TICK_RATIO_MASK, 40));
 }
 
+static void airoha_qdma_init_qos_stats(struct airoha_qdma *qdma)
+{
+	int i;
+
+	for (i = 0; i < AIROHA_NUM_QOS_CHANNELS; i++) {
+		/* Tx-cpu transferred count */
+		airoha_qdma_wr(qdma, REG_CNTR_VAL(i << 1), 0);
+		airoha_qdma_wr(qdma, REG_CNTR_CFG(i << 1),
+			       CNTR_EN_MASK | CNTR_ALL_QUEUE_EN_MASK |
+			       CNTR_ALL_DSCP_RING_EN_MASK |
+			       FIELD_PREP(CNTR_CHAN_MASK, i));
+		/* Tx-fwd transferred count */
+		airoha_qdma_wr(qdma, REG_CNTR_VAL((i << 1) + 1), 0);
+		airoha_qdma_wr(qdma, REG_CNTR_CFG(i << 1),
+			       CNTR_EN_MASK | CNTR_ALL_QUEUE_EN_MASK |
+			       CNTR_ALL_DSCP_RING_EN_MASK |
+			       FIELD_PREP(CNTR_SRC_MASK, 1) |
+			       FIELD_PREP(CNTR_CHAN_MASK, i));
+	}
+}
+
 static int airoha_qdma_hw_init(struct airoha_qdma *qdma)
 {
 	int i;
@@ -2005,6 +2114,7 @@ static int airoha_qdma_hw_init(struct airoha_qdma *qdma)
 
 	airoha_qdma_set(qdma, REG_TXQ_CNGST_CFG,
 			TXQ_CNGST_DROP_EN | TXQ_CNGST_DEI_DROP_EN);
+	airoha_qdma_init_qos_stats(qdma);
 
 	return 0;
 }
@@ -2425,21 +2535,44 @@ static void airoha_dev_get_stats64(struct net_device *dev,
 	} while (u64_stats_fetch_retry(&port->stats.syncp, start));
 }
 
+static u16 airoha_dev_select_queue(struct net_device *dev, struct sk_buff *skb,
+				   struct net_device *sb_dev)
+{
+	struct airoha_gdm_port *port = netdev_priv(dev);
+	int queue, channel;
+
+	/* For dsa device select QoS channel according to the dsa user port
+	 * index, rely on port id otherwise. Select QoS queue based on the
+	 * skb priority.
+	 */
+	channel = netdev_uses_dsa(dev) ? skb_get_queue_mapping(skb) : port->id;
+	channel = channel % AIROHA_NUM_QOS_CHANNELS;
+	queue = (skb->priority - 1) % AIROHA_NUM_QOS_QUEUES; /* QoS queue */
+	queue = channel * AIROHA_NUM_QOS_QUEUES + queue;
+
+	return queue < dev->num_tx_queues ? queue : 0;
+}
+
 static netdev_tx_t airoha_dev_xmit(struct sk_buff *skb,
 				   struct net_device *dev)
 {
 	struct skb_shared_info *sinfo = skb_shinfo(skb);
 	struct airoha_gdm_port *port = netdev_priv(dev);
-	u32 msg0 = 0, msg1, len = skb_headlen(skb);
-	int i, qid = skb_get_queue_mapping(skb);
+	u32 msg0, msg1, len = skb_headlen(skb);
 	struct airoha_qdma *qdma = port->qdma;
 	u32 nr_frags = 1 + sinfo->nr_frags;
 	struct netdev_queue *txq;
 	struct airoha_queue *q;
 	void *data = skb->data;
+	int i, qid;
 	u16 index;
 	u8 fport;
 
+	qid = skb_get_queue_mapping(skb) % ARRAY_SIZE(qdma->q_tx);
+	msg0 = FIELD_PREP(QDMA_ETH_TXMSG_CHAN_MASK,
+			  qid / AIROHA_NUM_QOS_QUEUES) |
+	       FIELD_PREP(QDMA_ETH_TXMSG_QUEUE_MASK,
+			  qid % AIROHA_NUM_QOS_QUEUES);
 	if (skb->ip_summed == CHECKSUM_PARTIAL)
 		msg0 |= FIELD_PREP(QDMA_ETH_TXMSG_TCO_MASK, 1) |
 			FIELD_PREP(QDMA_ETH_TXMSG_UCO_MASK, 1) |
@@ -2609,13 +2742,386 @@ airoha_ethtool_get_rmon_stats(struct net_device *dev,
 	} while (u64_stats_fetch_retry(&port->stats.syncp, start));
 }
 
+static int airoha_qdma_set_chan_tx_sched(struct airoha_gdm_port *port,
+					 int channel, enum tx_sched_mode mode,
+					 const u16 *weights, u8 n_weights)
+{
+	int i;
+
+	for (i = 0; i < AIROHA_NUM_TX_RING; i++)
+		airoha_qdma_clear(port->qdma, REG_QUEUE_CLOSE_CFG(channel),
+				  TXQ_DISABLE_CHAN_QUEUE_MASK(channel, i));
+
+	for (i = 0; i < n_weights; i++) {
+		u32 status;
+		int err;
+
+		airoha_qdma_wr(port->qdma, REG_TXWRR_WEIGHT_CFG,
+			       TWRR_RW_CMD_MASK |
+			       FIELD_PREP(TWRR_CHAN_IDX_MASK, channel) |
+			       FIELD_PREP(TWRR_QUEUE_IDX_MASK, i) |
+			       FIELD_PREP(TWRR_VALUE_MASK, weights[i]));
+		err = read_poll_timeout(airoha_qdma_rr, status,
+					status & TWRR_RW_CMD_DONE,
+					USEC_PER_MSEC, 10 * USEC_PER_MSEC,
+					true, port->qdma,
+					REG_TXWRR_WEIGHT_CFG);
+		if (err)
+			return err;
+	}
+
+	airoha_qdma_rmw(port->qdma, REG_CHAN_QOS_MODE(channel >> 3),
+			CHAN_QOS_MODE_MASK(channel),
+			mode << __ffs(CHAN_QOS_MODE_MASK(channel)));
+
+	return 0;
+}
+
+static int airoha_qdma_set_tx_prio_sched(struct airoha_gdm_port *port,
+					 int channel)
+{
+	static const u16 w[AIROHA_NUM_QOS_QUEUES] = {};
+
+	return airoha_qdma_set_chan_tx_sched(port, channel, TC_SCH_SP, w,
+					     ARRAY_SIZE(w));
+}
+
+static int airoha_qdma_set_tx_ets_sched(struct airoha_gdm_port *port,
+					int channel,
+					struct tc_ets_qopt_offload *opt)
+{
+	struct tc_ets_qopt_offload_replace_params *p = &opt->replace_params;
+	enum tx_sched_mode mode = TC_SCH_SP;
+	u16 w[AIROHA_NUM_QOS_QUEUES] = {};
+	int i, nstrict = 0;
+
+	if (p->bands > AIROHA_NUM_QOS_QUEUES)
+		return -EINVAL;
+
+	for (i = 0; i < p->bands; i++) {
+		if (!p->quanta[i])
+			nstrict++;
+	}
+
+	/* this configuration is not supported by the hw */
+	if (nstrict == AIROHA_NUM_QOS_QUEUES - 1)
+		return -EINVAL;
+
+	for (i = 0; i < p->bands - nstrict; i++)
+		w[i] = p->weights[nstrict + i];
+
+	if (!nstrict)
+		mode = TC_SCH_WRR8;
+	else if (nstrict < AIROHA_NUM_QOS_QUEUES - 1)
+		mode = nstrict + 1;
+
+	return airoha_qdma_set_chan_tx_sched(port, channel, mode, w,
+					     ARRAY_SIZE(w));
+}
+
+static int airoha_qdma_get_tx_ets_stats(struct airoha_gdm_port *port,
+					int channel,
+					struct tc_ets_qopt_offload *opt)
+{
+	u64 cpu_tx_packets = airoha_qdma_rr(port->qdma,
+					    REG_CNTR_VAL(channel << 1));
+	u64 fwd_tx_packets = airoha_qdma_rr(port->qdma,
+					    REG_CNTR_VAL((channel << 1) + 1));
+	u64 tx_packets = (cpu_tx_packets - port->cpu_tx_packets) +
+			 (fwd_tx_packets - port->fwd_tx_packets);
+	_bstats_update(opt->stats.bstats, 0, tx_packets);
+
+	port->cpu_tx_packets = cpu_tx_packets;
+	port->fwd_tx_packets = fwd_tx_packets;
+
+	return 0;
+}
+
+static int airoha_tc_setup_qdisc_ets(struct airoha_gdm_port *port,
+				     struct tc_ets_qopt_offload *opt)
+{
+	int channel;
+
+	if (opt->parent == TC_H_ROOT)
+		return -EINVAL;
+
+	channel = TC_H_MAJ(opt->handle) >> 16;
+	channel = channel % AIROHA_NUM_QOS_CHANNELS;
+
+	switch (opt->command) {
+	case TC_ETS_REPLACE:
+		return airoha_qdma_set_tx_ets_sched(port, channel, opt);
+	case TC_ETS_DESTROY:
+		/* PRIO is default qdisc scheduler */
+		return airoha_qdma_set_tx_prio_sched(port, channel);
+	case TC_ETS_STATS:
+		return airoha_qdma_get_tx_ets_stats(port, channel, opt);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+static int airoha_qdma_get_trtcm_param(struct airoha_qdma *qdma, int channel,
+				       u32 addr, enum trtcm_param_type param,
+				       enum trtcm_mode_type mode,
+				       u32 *val_low, u32 *val_high)
+{
+	u32 idx = QDMA_METER_IDX(channel), group = QDMA_METER_GROUP(channel);
+	u32 val, config = FIELD_PREP(TRTCM_PARAM_TYPE_MASK, param) |
+			  FIELD_PREP(TRTCM_METER_GROUP_MASK, group) |
+			  FIELD_PREP(TRTCM_PARAM_INDEX_MASK, idx) |
+			  FIELD_PREP(TRTCM_PARAM_RATE_TYPE_MASK, mode);
+
+	airoha_qdma_wr(qdma, REG_TRTCM_CFG_PARAM(addr), config);
+	if (read_poll_timeout(airoha_qdma_rr, val,
+			      val & TRTCM_PARAM_RW_DONE_MASK,
+			      USEC_PER_MSEC, 10 * USEC_PER_MSEC, true,
+			      qdma, REG_TRTCM_CFG_PARAM(addr)))
+		return -ETIMEDOUT;
+
+	*val_low = airoha_qdma_rr(qdma, REG_TRTCM_DATA_LOW(addr));
+	if (val_high)
+		*val_high = airoha_qdma_rr(qdma, REG_TRTCM_DATA_HIGH(addr));
+
+	return 0;
+}
+
+static int airoha_qdma_set_trtcm_param(struct airoha_qdma *qdma, int channel,
+				       u32 addr, enum trtcm_param_type param,
+				       enum trtcm_mode_type mode, u32 val)
+{
+	u32 idx = QDMA_METER_IDX(channel), group = QDMA_METER_GROUP(channel);
+	u32 config = TRTCM_PARAM_RW_MASK |
+		     FIELD_PREP(TRTCM_PARAM_TYPE_MASK, param) |
+		     FIELD_PREP(TRTCM_METER_GROUP_MASK, group) |
+		     FIELD_PREP(TRTCM_PARAM_INDEX_MASK, idx) |
+		     FIELD_PREP(TRTCM_PARAM_RATE_TYPE_MASK, mode);
+
+	airoha_qdma_wr(qdma, REG_TRTCM_DATA_LOW(addr), val);
+	airoha_qdma_wr(qdma, REG_TRTCM_CFG_PARAM(addr), config);
+
+	return read_poll_timeout(airoha_qdma_rr, val,
+				 val & TRTCM_PARAM_RW_DONE_MASK,
+				 USEC_PER_MSEC, 10 * USEC_PER_MSEC, true,
+				 qdma, REG_TRTCM_CFG_PARAM(addr));
+}
+
+static int airoha_qdma_set_trtcm_config(struct airoha_qdma *qdma, int channel,
+					u32 addr, enum trtcm_mode_type mode,
+					bool enable, u32 enable_mask)
+{
+	u32 val;
+
+	if (airoha_qdma_get_trtcm_param(qdma, channel, addr, TRTCM_MISC_MODE,
+					mode, &val, NULL))
+		return -EINVAL;
+
+	val = enable ? val | enable_mask : val & ~enable_mask;
+
+	return airoha_qdma_set_trtcm_param(qdma, channel, addr, TRTCM_MISC_MODE,
+					   mode, val);
+}
+
+static int airoha_qdma_set_trtcm_token_bucket(struct airoha_qdma *qdma,
+					      int channel, u32 addr,
+					      enum trtcm_mode_type mode,
+					      u32 rate_val, u32 bucket_size)
+{
+	u32 val, config, tick, unit, rate, rate_frac;
+	int err;
+
+	if (airoha_qdma_get_trtcm_param(qdma, channel, addr, TRTCM_MISC_MODE,
+					mode, &config, NULL))
+		return -EINVAL;
+
+	val = airoha_qdma_rr(qdma, addr);
+	tick = FIELD_GET(INGRESS_FAST_TICK_MASK, val);
+	if (config & TRTCM_TICK_SEL)
+		tick *= FIELD_GET(INGRESS_SLOW_TICK_RATIO_MASK, val);
+	if (!tick)
+		return -EINVAL;
+
+	unit = (config & TRTCM_PKT_MODE) ? 1000000 / tick : 8000 / tick;
+	if (!unit)
+		return -EINVAL;
+
+	rate = rate_val / unit;
+	rate_frac = rate_val % unit;
+	rate_frac = FIELD_PREP(TRTCM_TOKEN_RATE_MASK, rate_frac) / unit;
+	rate = FIELD_PREP(TRTCM_TOKEN_RATE_MASK, rate) |
+	       FIELD_PREP(TRTCM_TOKEN_RATE_FRACTION_MASK, rate_frac);
+
+	err = airoha_qdma_set_trtcm_param(qdma, channel, addr,
+					  TRTCM_TOKEN_RATE_MODE, mode, rate);
+	if (err)
+		return err;
+
+	val = max_t(u32, bucket_size, MIN_TOKEN_SIZE);
+	val = min_t(u32, __fls(val), MAX_TOKEN_SIZE_OFFSET);
+
+	return airoha_qdma_set_trtcm_param(qdma, channel, addr,
+					   TRTCM_BUCKETSIZE_SHIFT_MODE,
+					   mode, val);
+}
+
+static int airoha_qdma_set_tx_rate_limit(struct airoha_gdm_port *port,
+					 int channel, u32 rate,
+					 u32 bucket_size)
+{
+	int i, err;
+
+	for (i = 0; i <= TRTCM_PEAK_MODE; i++) {
+		err = airoha_qdma_set_trtcm_config(port->qdma, channel,
+						   REG_EGRESS_TRTCM_CFG, i,
+						   !!rate, TRTCM_METER_MODE);
+		if (err)
+			return err;
+
+		err = airoha_qdma_set_trtcm_token_bucket(port->qdma, channel,
+							 REG_EGRESS_TRTCM_CFG,
+							 i, rate, bucket_size);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+static int airoha_tc_htb_alloc_leaf_queue(struct airoha_gdm_port *port,
+					  struct tc_htb_qopt_offload *opt)
+{
+	u32 channel = TC_H_MIN(opt->classid) % AIROHA_NUM_QOS_CHANNELS;
+	u32 rate = div_u64(opt->rate, 1000) << 3; /* kbps */
+	struct net_device *dev = port->dev;
+	int num_tx_queues = dev->real_num_tx_queues;
+	int err;
+
+	if (opt->parent_classid != TC_HTB_CLASSID_ROOT) {
+		NL_SET_ERR_MSG_MOD(opt->extack, "invalid parent classid");
+		return -EINVAL;
+	}
+
+	err = airoha_qdma_set_tx_rate_limit(port, channel, rate, opt->quantum);
+	if (err) {
+		NL_SET_ERR_MSG_MOD(opt->extack,
+				   "failed configuring htb offload");
+		return err;
+	}
+
+	if (opt->command == TC_HTB_NODE_MODIFY)
+		return 0;
+
+	err = netif_set_real_num_tx_queues(dev, num_tx_queues + 1);
+	if (err) {
+		airoha_qdma_set_tx_rate_limit(port, channel, 0, opt->quantum);
+		NL_SET_ERR_MSG_MOD(opt->extack,
+				   "failed setting real_num_tx_queues");
+		return err;
+	}
+
+	set_bit(channel, port->qos_sq_bmap);
+	opt->qid = AIROHA_NUM_TX_RING + channel;
+
+	return 0;
+}
+
+static void airoha_tc_remove_htb_queue(struct airoha_gdm_port *port, int queue)
+{
+	struct net_device *dev = port->dev;
+
+	netif_set_real_num_tx_queues(dev, dev->real_num_tx_queues - 1);
+	airoha_qdma_set_tx_rate_limit(port, queue + 1, 0, 0);
+	clear_bit(queue, port->qos_sq_bmap);
+}
+
+static int airoha_tc_htb_delete_leaf_queue(struct airoha_gdm_port *port,
+					   struct tc_htb_qopt_offload *opt)
+{
+	u32 channel = TC_H_MIN(opt->classid) % AIROHA_NUM_QOS_CHANNELS;
+
+	if (!test_bit(channel, port->qos_sq_bmap)) {
+		NL_SET_ERR_MSG_MOD(opt->extack, "invalid queue id");
+		return -EINVAL;
+	}
+
+	airoha_tc_remove_htb_queue(port, channel);
+
+	return 0;
+}
+
+static int airoha_tc_htb_destroy(struct airoha_gdm_port *port)
+{
+	int q;
+
+	for_each_set_bit(q, port->qos_sq_bmap, AIROHA_NUM_QOS_CHANNELS)
+		airoha_tc_remove_htb_queue(port, q);
+
+	return 0;
+}
+
+static int airoha_tc_get_htb_get_leaf_queue(struct airoha_gdm_port *port,
+					    struct tc_htb_qopt_offload *opt)
+{
+	u32 channel = TC_H_MIN(opt->classid) % AIROHA_NUM_QOS_CHANNELS;
+
+	if (!test_bit(channel, port->qos_sq_bmap)) {
+		NL_SET_ERR_MSG_MOD(opt->extack, "invalid queue id");
+		return -EINVAL;
+	}
+
+	opt->qid = channel;
+
+	return 0;
+}
+
+static int airoha_tc_setup_qdisc_htb(struct airoha_gdm_port *port,
+				     struct tc_htb_qopt_offload *opt)
+{
+	switch (opt->command) {
+	case TC_HTB_CREATE:
+		break;
+	case TC_HTB_DESTROY:
+		return airoha_tc_htb_destroy(port);
+	case TC_HTB_NODE_MODIFY:
+	case TC_HTB_LEAF_ALLOC_QUEUE:
+		return airoha_tc_htb_alloc_leaf_queue(port, opt);
+	case TC_HTB_LEAF_DEL:
+	case TC_HTB_LEAF_DEL_LAST:
+	case TC_HTB_LEAF_DEL_LAST_FORCE:
+		return airoha_tc_htb_delete_leaf_queue(port, opt);
+	case TC_HTB_LEAF_QUERY_QUEUE:
+		return airoha_tc_get_htb_get_leaf_queue(port, opt);
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	return 0;
+}
+
+static int airoha_dev_tc_setup(struct net_device *dev, enum tc_setup_type type,
+			       void *type_data)
+{
+	struct airoha_gdm_port *port = netdev_priv(dev);
+
+	switch (type) {
+	case TC_SETUP_QDISC_ETS:
+		return airoha_tc_setup_qdisc_ets(port, type_data);
+	case TC_SETUP_QDISC_HTB:
+		return airoha_tc_setup_qdisc_htb(port, type_data);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
 static const struct net_device_ops airoha_netdev_ops = {
 	.ndo_init		= airoha_dev_init,
 	.ndo_open		= airoha_dev_open,
 	.ndo_stop		= airoha_dev_stop,
+	.ndo_select_queue	= airoha_dev_select_queue,
 	.ndo_start_xmit		= airoha_dev_xmit,
 	.ndo_get_stats64        = airoha_dev_get_stats64,
 	.ndo_set_mac_address	= airoha_dev_set_macaddr,
+	.ndo_setup_tc		= airoha_dev_tc_setup,
 };
 
 static const struct ethtool_ops airoha_ethtool_ops = {
@@ -2652,7 +3158,8 @@ static int airoha_alloc_gdm_port(struct airoha_eth *eth, struct device_node *np)
 	}
 
 	dev = devm_alloc_etherdev_mqs(eth->dev, sizeof(*port),
-				      AIROHA_NUM_TX_RING, AIROHA_NUM_RX_RING);
+				      AIROHA_NUM_NETDEV_TX_RINGS,
+				      AIROHA_NUM_RX_RING);
 	if (!dev) {
 		dev_err(eth->dev, "alloc_etherdev failed\n");
 		return -ENOMEM;
@@ -2665,12 +3172,18 @@ static int airoha_alloc_gdm_port(struct airoha_eth *eth, struct device_node *np)
 	dev->watchdog_timeo = 5 * HZ;
 	dev->hw_features = NETIF_F_IP_CSUM | NETIF_F_RXCSUM |
 			   NETIF_F_TSO6 | NETIF_F_IPV6_CSUM |
-			   NETIF_F_SG | NETIF_F_TSO;
+			   NETIF_F_SG | NETIF_F_TSO |
+			   NETIF_F_HW_TC;
 	dev->features |= dev->hw_features;
 	dev->dev.of_node = np;
 	dev->irq = qdma->irq;
 	SET_NETDEV_DEV(dev, eth->dev);
 
+	/* reserve hw queues for HTB offloading */
+	err = netif_set_real_num_tx_queues(dev, AIROHA_NUM_TX_RING);
+	if (err)
+		return err;
+
 	err = of_get_ethdev_address(np, dev);
 	if (err) {
 		if (err == -EPROBE_DEFER)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index 10a763e668ed..d9a8817bb33c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -151,8 +151,9 @@ mlx5_core-$(CONFIG_MLX5_HW_STEERING) += steering/hws/cmd.o \
 					steering/hws/bwc.o \
 					steering/hws/debug.o \
 					steering/hws/vport.o \
-					steering/hws/bwc_complex.o
-
+					steering/hws/bwc_complex.o \
+					steering/hws/fs_hws_pools.o \
+					steering/hws/fs_hws.o
 
 #
 # SF device
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
index 6bd8a18e3af3..e733b81e18a2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
@@ -1013,6 +1013,7 @@ static void cmd_work_handler(struct work_struct *work)
 				complete(&ent->done);
 			}
 			up(&cmd->vars.sem);
+			complete(&ent->slotted);
 			return;
 		}
 	} else {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c
index ca92e518be76..8489b0a0e8bd 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c
@@ -94,25 +94,14 @@ static bool mlx5e_ipsec_update_esn_state(struct mlx5e_ipsec_sa_entry *sa_entry)
 	u32 esn, esn_msb;
 	u8 overlap;
 
-	switch (x->xso.type) {
-	case XFRM_DEV_OFFLOAD_PACKET:
-		switch (x->xso.dir) {
-		case XFRM_DEV_OFFLOAD_IN:
-			esn = x->replay_esn->seq;
-			esn_msb = x->replay_esn->seq_hi;
-			break;
-		case XFRM_DEV_OFFLOAD_OUT:
-			esn = x->replay_esn->oseq;
-			esn_msb = x->replay_esn->oseq_hi;
-			break;
-		default:
-			WARN_ON(true);
-			return false;
-		}
-		break;
-	case XFRM_DEV_OFFLOAD_CRYPTO:
-		/* Already parsed by XFRM core */
+	switch (x->xso.dir) {
+	case XFRM_DEV_OFFLOAD_IN:
 		esn = x->replay_esn->seq;
+		esn_msb = x->replay_esn->seq_hi;
+		break;
+	case XFRM_DEV_OFFLOAD_OUT:
+		esn = x->replay_esn->oseq;
+		esn_msb = x->replay_esn->oseq_hi;
 		break;
 	default:
 		WARN_ON(true);
@@ -121,11 +110,15 @@ static bool mlx5e_ipsec_update_esn_state(struct mlx5e_ipsec_sa_entry *sa_entry)
 
 	overlap = sa_entry->esn_state.overlap;
 
-	if (esn >= x->replay_esn->replay_window)
-		seq_bottom = esn - x->replay_esn->replay_window + 1;
+	if (!x->replay_esn->replay_window) {
+		seq_bottom = esn;
+	} else {
+		if (esn >= x->replay_esn->replay_window)
+			seq_bottom = esn - x->replay_esn->replay_window + 1;
 
-	if (x->xso.type == XFRM_DEV_OFFLOAD_CRYPTO)
-		esn_msb = xfrm_replay_seqhi(x, htonl(seq_bottom));
+		if (x->xso.type == XFRM_DEV_OFFLOAD_CRYPTO)
+			esn_msb = xfrm_replay_seqhi(x, htonl(seq_bottom));
+	}
 
 	if (sa_entry->esn_state.esn_msb)
 		sa_entry->esn_state.esn = esn;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index 41b5e98a0495..f43fd96a680d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -3535,35 +3535,42 @@ static int mlx5_fs_mode_validate(struct devlink *devlink, u32 id,
 {
 	struct mlx5_core_dev *dev = devlink_priv(devlink);
 	char *value = val.vstr;
-	int err = 0;
+	u8 eswitch_mode;
 
-	if (!strcmp(value, "dmfs")) {
+	if (!strcmp(value, "dmfs"))
 		return 0;
-	} else if (!strcmp(value, "smfs")) {
-		u8 eswitch_mode;
-		bool smfs_cap;
 
-		eswitch_mode = mlx5_eswitch_mode(dev);
-		smfs_cap = mlx5_fs_dr_is_supported(dev);
+	if (!strcmp(value, "smfs")) {
+		bool smfs_cap = mlx5_fs_dr_is_supported(dev);
 
 		if (!smfs_cap) {
-			err = -EOPNOTSUPP;
 			NL_SET_ERR_MSG_MOD(extack,
 					   "Software managed steering is not supported by current device");
+			return -EOPNOTSUPP;
 		}
+	} else if (!strcmp(value, "hmfs")) {
+		bool hmfs_cap = mlx5_fs_hws_is_supported(dev);
 
-		else if (eswitch_mode == MLX5_ESWITCH_OFFLOADS) {
+		if (!hmfs_cap) {
 			NL_SET_ERR_MSG_MOD(extack,
-					   "Software managed steering is not supported when eswitch offloads enabled.");
-			err = -EOPNOTSUPP;
+					   "Hardware steering is not supported by current device");
+			return -EOPNOTSUPP;
 		}
 	} else {
 		NL_SET_ERR_MSG_MOD(extack,
-				   "Bad parameter: supported values are [\"dmfs\", \"smfs\"]");
-		err = -EINVAL;
+				   "Bad parameter: supported values are [\"dmfs\", \"smfs\", \"hmfs\"]");
+		return -EINVAL;
 	}
 
-	return err;
+	eswitch_mode = mlx5_eswitch_mode(dev);
+	if (eswitch_mode == MLX5_ESWITCH_OFFLOADS) {
+		NL_SET_ERR_MSG_FMT_MOD(extack,
+				       "Moving to %s is not supported when eswitch offloads enabled.",
+				       value);
+		return -EOPNOTSUPP;
+	}
+
+	return 0;
 }
 
 static int mlx5_fs_mode_set(struct devlink *devlink, u32 id,
@@ -3575,6 +3582,8 @@ static int mlx5_fs_mode_set(struct devlink *devlink, u32 id,
 
 	if (!strcmp(ctx->val.vstr, "smfs"))
 		mode = MLX5_FLOW_STEERING_MODE_SMFS;
+	else if (!strcmp(ctx->val.vstr, "hmfs"))
+		mode = MLX5_FLOW_STEERING_MODE_HMFS;
 	else
 		mode = MLX5_FLOW_STEERING_MODE_DMFS;
 	dev->priv.steering->mode = mode;
@@ -3587,10 +3596,17 @@ static int mlx5_fs_mode_get(struct devlink *devlink, u32 id,
 {
 	struct mlx5_core_dev *dev = devlink_priv(devlink);
 
-	if (dev->priv.steering->mode == MLX5_FLOW_STEERING_MODE_SMFS)
+	switch (dev->priv.steering->mode) {
+	case MLX5_FLOW_STEERING_MODE_SMFS:
 		strscpy(ctx->val.vstr, "smfs", sizeof(ctx->val.vstr));
-	else
+		break;
+	case MLX5_FLOW_STEERING_MODE_HMFS:
+		strscpy(ctx->val.vstr, "hmfs", sizeof(ctx->val.vstr));
+		break;
+	default:
 		strscpy(ctx->val.vstr, "dmfs", sizeof(ctx->val.vstr));
+	}
+
 	return 0;
 }
 
@@ -4009,6 +4025,8 @@ int mlx5_flow_namespace_set_mode(struct mlx5_flow_namespace *ns,
 
 	if (mode == MLX5_FLOW_STEERING_MODE_SMFS)
 		cmds = mlx5_fs_cmd_get_dr_cmds();
+	else if (mode == MLX5_FLOW_STEERING_MODE_HMFS)
+		cmds = mlx5_fs_cmd_get_hws_cmds();
 	else
 		cmds = mlx5_fs_cmd_get_fw_cmds();
 	if (!cmds)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
index bad2df0715ec..20837e526679 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
@@ -38,6 +38,7 @@
 #include <linux/rhashtable.h>
 #include <linux/llist.h>
 #include <steering/sws/fs_dr.h>
+#include <steering/hws/fs_hws.h>
 
 #define FDB_TC_MAX_CHAIN 3
 #define FDB_FT_CHAIN (FDB_TC_MAX_CHAIN + 1)
@@ -64,6 +65,7 @@ struct mlx5_modify_hdr {
 	enum mlx5_flow_resource_owner owner;
 	union {
 		struct mlx5_fs_dr_action fs_dr_action;
+		struct mlx5_fs_hws_action fs_hws_action;
 		u32 id;
 	};
 };
@@ -74,6 +76,7 @@ struct mlx5_pkt_reformat {
 	enum mlx5_flow_resource_owner owner;
 	union {
 		struct mlx5_fs_dr_action fs_dr_action;
+		struct mlx5_fs_hws_action fs_hws_action;
 		u32 id;
 	};
 };
@@ -126,7 +129,8 @@ enum fs_fte_status {
 
 enum mlx5_flow_steering_mode {
 	MLX5_FLOW_STEERING_MODE_DMFS,
-	MLX5_FLOW_STEERING_MODE_SMFS
+	MLX5_FLOW_STEERING_MODE_SMFS,
+	MLX5_FLOW_STEERING_MODE_HMFS,
 };
 
 enum mlx5_flow_steering_capabilty {
@@ -190,7 +194,10 @@ struct mlx5_flow_handle {
 /* Type of children is mlx5_flow_group */
 struct mlx5_flow_table {
 	struct fs_node			node;
-	struct mlx5_fs_dr_table		fs_dr_table;
+	union {
+		struct mlx5_fs_dr_table		fs_dr_table;
+		struct mlx5_fs_hws_table	fs_hws_table;
+	};
 	u32				id;
 	u16				vport;
 	unsigned int			max_fte;
@@ -247,7 +254,10 @@ struct fs_fte_dup {
 /* Type of children is mlx5_flow_rule */
 struct fs_fte {
 	struct fs_node			node;
-	struct mlx5_fs_dr_rule		fs_dr_rule;
+	union {
+		struct mlx5_fs_dr_rule		fs_dr_rule;
+		struct mlx5_fs_hws_rule		fs_hws_rule;
+	};
 	u32				val[MLX5_ST_SZ_DW_MATCH_PARAM];
 	struct fs_fte_action		act_dests;
 	struct fs_fte_dup		*dup;
@@ -280,7 +290,10 @@ struct mlx5_flow_group_mask {
 /* Type of children is fs_fte */
 struct mlx5_flow_group {
 	struct fs_node			node;
-	struct mlx5_fs_dr_matcher	fs_dr_matcher;
+	union {
+		struct mlx5_fs_dr_matcher	fs_dr_matcher;
+		struct mlx5_fs_hws_matcher	fs_hws_matcher;
+	};
 	struct mlx5_flow_group_mask	mask;
 	u32				start_index;
 	u32				max_ftes;
@@ -293,7 +306,10 @@ struct mlx5_flow_group {
 struct mlx5_flow_root_namespace {
 	struct mlx5_flow_namespace	ns;
 	enum   mlx5_flow_steering_mode	mode;
-	struct mlx5_fs_dr_domain	fs_dr_domain;
+	union {
+		struct mlx5_fs_dr_domain	fs_dr_domain;
+		struct mlx5_fs_hws_context	fs_hws_context;
+	};
 	enum   fs_flow_table_type	table_type;
 	struct mlx5_core_dev		*dev;
 	struct mlx5_flow_table		*root_ft;
@@ -303,6 +319,42 @@ struct mlx5_flow_root_namespace {
 	const struct mlx5_flow_cmds	*cmds;
 };
 
+enum mlx5_fc_type {
+	MLX5_FC_TYPE_ACQUIRED = 0,
+	MLX5_FC_TYPE_LOCAL,
+};
+
+struct mlx5_fc_cache {
+	u64 packets;
+	u64 bytes;
+	u64 lastuse;
+};
+
+struct mlx5_fc {
+	u32 id;
+	bool aging;
+	enum mlx5_fc_type type;
+	struct mlx5_fc_bulk *bulk;
+	struct mlx5_fc_cache cache;
+	/* last{packets,bytes} are used for calculating deltas since last reading. */
+	u64 lastpackets;
+	u64 lastbytes;
+};
+
+struct mlx5_fc_bulk_hws_data {
+	struct mlx5hws_action *hws_action;
+	struct mutex lock; /* protects hws_action */
+	refcount_t hws_action_refcount;
+};
+
+struct mlx5_fc_bulk {
+	struct mlx5_fs_bulk fs_bulk;
+	u32 base_id;
+	struct mlx5_fc_bulk_hws_data hws_data;
+	struct mlx5_fc fcs[];
+};
+
+u32 mlx5_fc_get_base_id(struct mlx5_fc *counter);
 int mlx5_init_fc_stats(struct mlx5_core_dev *dev);
 void mlx5_cleanup_fc_stats(struct mlx5_core_dev *dev);
 void mlx5_fc_queue_stats_work(struct mlx5_core_dev *dev,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
index d8e1c4ebd364..492775d3d193 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
@@ -44,28 +44,6 @@
 #define MLX5_FC_POOL_MAX_THRESHOLD BIT(18)
 #define MLX5_FC_POOL_USED_BUFF_RATIO 10
 
-enum mlx5_fc_type {
-	MLX5_FC_TYPE_ACQUIRED = 0,
-	MLX5_FC_TYPE_LOCAL,
-};
-
-struct mlx5_fc_cache {
-	u64 packets;
-	u64 bytes;
-	u64 lastuse;
-};
-
-struct mlx5_fc {
-	u32 id;
-	bool aging;
-	enum mlx5_fc_type type;
-	struct mlx5_fc_bulk *bulk;
-	struct mlx5_fc_cache cache;
-	/* last{packets,bytes} are used for calculating deltas since last reading. */
-	u64 lastpackets;
-	u64 lastbytes;
-};
-
 struct mlx5_fc_stats {
 	struct xarray counters;
 
@@ -434,13 +412,7 @@ void mlx5_fc_update_sampling_interval(struct mlx5_core_dev *dev,
 					    fc_stats->sampling_interval);
 }
 
-/* Flow counter bluks */
-
-struct mlx5_fc_bulk {
-	struct mlx5_fs_bulk fs_bulk;
-	u32 base_id;
-	struct mlx5_fc fcs[];
-};
+/* Flow counter bulks */
 
 static void mlx5_fc_init(struct mlx5_fc *counter, struct mlx5_fc_bulk *bulk,
 			 u32 id)
@@ -449,7 +421,13 @@ static void mlx5_fc_init(struct mlx5_fc *counter, struct mlx5_fc_bulk *bulk,
 	counter->id = id;
 }
 
-static struct mlx5_fs_bulk *mlx5_fc_bulk_create(struct mlx5_core_dev *dev)
+u32 mlx5_fc_get_base_id(struct mlx5_fc *counter)
+{
+	return counter->bulk->base_id;
+}
+
+static struct mlx5_fs_bulk *mlx5_fc_bulk_create(struct mlx5_core_dev *dev,
+						void *pool_ctx)
 {
 	enum mlx5_fc_bulk_alloc_bitmask alloc_bitmask;
 	struct mlx5_fc_bulk *fc_bulk;
@@ -473,6 +451,8 @@ static struct mlx5_fs_bulk *mlx5_fc_bulk_create(struct mlx5_core_dev *dev)
 	for (i = 0; i < bulk_len; i++)
 		mlx5_fc_init(&fc_bulk->fcs[i], fc_bulk, base_id + i);
 
+	refcount_set(&fc_bulk->hws_data.hws_action_refcount, 0);
+	mutex_init(&fc_bulk->hws_data.lock);
 	return &fc_bulk->fs_bulk;
 
 fs_bulk_cleanup:
@@ -518,7 +498,7 @@ static const struct mlx5_fs_pool_ops mlx5_fc_pool_ops = {
 static void
 mlx5_fc_pool_init(struct mlx5_fs_pool *fc_pool, struct mlx5_core_dev *dev)
 {
-	mlx5_fs_pool_init(fc_pool, dev, &mlx5_fc_pool_ops);
+	mlx5_fs_pool_init(fc_pool, dev, &mlx5_fc_pool_ops, NULL);
 }
 
 static void mlx5_fc_pool_cleanup(struct mlx5_fs_pool *fc_pool)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_pool.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_pool.c
index b891d7b9e3e0..f6c226664602 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_pool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_pool.c
@@ -56,11 +56,12 @@ static int mlx5_fs_bulk_release_index(struct mlx5_fs_bulk *fs_bulk, int index)
 }
 
 void mlx5_fs_pool_init(struct mlx5_fs_pool *pool, struct mlx5_core_dev *dev,
-		       const struct mlx5_fs_pool_ops *ops)
+		       const struct mlx5_fs_pool_ops *ops, void *pool_ctx)
 {
 	WARN_ON_ONCE(!ops || !ops->bulk_destroy || !ops->bulk_create ||
 		     !ops->update_threshold);
 	pool->dev = dev;
+	pool->pool_ctx = pool_ctx;
 	mutex_init(&pool->pool_lock);
 	INIT_LIST_HEAD(&pool->fully_used);
 	INIT_LIST_HEAD(&pool->partially_used);
@@ -91,7 +92,7 @@ mlx5_fs_pool_alloc_new_bulk(struct mlx5_fs_pool *fs_pool)
 	struct mlx5_core_dev *dev = fs_pool->dev;
 	struct mlx5_fs_bulk *new_bulk;
 
-	new_bulk = fs_pool->ops->bulk_create(dev);
+	new_bulk = fs_pool->ops->bulk_create(dev, fs_pool->pool_ctx);
 	if (new_bulk)
 		fs_pool->available_units += new_bulk->bulk_len;
 	fs_pool->ops->update_threshold(fs_pool);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_pool.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_pool.h
index 3b149863260c..f04ec3107498 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_pool.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_pool.h
@@ -21,7 +21,8 @@ struct mlx5_fs_pool;
 
 struct mlx5_fs_pool_ops {
 	int (*bulk_destroy)(struct mlx5_core_dev *dev, struct mlx5_fs_bulk *bulk);
-	struct mlx5_fs_bulk * (*bulk_create)(struct mlx5_core_dev *dev);
+	struct mlx5_fs_bulk * (*bulk_create)(struct mlx5_core_dev *dev,
+					     void *pool_ctx);
 	void (*update_threshold)(struct mlx5_fs_pool *pool);
 };
 
@@ -44,7 +45,7 @@ void mlx5_fs_bulk_cleanup(struct mlx5_fs_bulk *fs_bulk);
 int mlx5_fs_bulk_get_free_amount(struct mlx5_fs_bulk *bulk);
 
 void mlx5_fs_pool_init(struct mlx5_fs_pool *pool, struct mlx5_core_dev *dev,
-		       const struct mlx5_fs_pool_ops *ops);
+		       const struct mlx5_fs_pool_ops *ops, void *pool_ctx);
 void mlx5_fs_pool_cleanup(struct mlx5_fs_pool *pool);
 int mlx5_fs_pool_acquire_index(struct mlx5_fs_pool *fs_pool,
 			       struct mlx5_fs_pool_index *pool_index);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c
index 4822d01123b4..d61a1a9297c9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c
@@ -322,17 +322,16 @@ static void mlx5_pps_out(struct work_struct *work)
 	}
 }
 
-static void mlx5_timestamp_overflow(struct work_struct *work)
+static long mlx5_timestamp_overflow(struct ptp_clock_info *ptp_info)
 {
-	struct delayed_work *dwork = to_delayed_work(work);
 	struct mlx5_core_dev *mdev;
 	struct mlx5_timer *timer;
 	struct mlx5_clock *clock;
 	unsigned long flags;
 
-	timer = container_of(dwork, struct mlx5_timer, overflow_work);
-	clock = container_of(timer, struct mlx5_clock, timer);
+	clock = container_of(ptp_info, struct mlx5_clock, ptp_info);
 	mdev = container_of(clock, struct mlx5_core_dev, clock);
+	timer = &clock->timer;
 
 	if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)
 		goto out;
@@ -343,7 +342,7 @@ static void mlx5_timestamp_overflow(struct work_struct *work)
 	write_sequnlock_irqrestore(&clock->lock, flags);
 
 out:
-	schedule_delayed_work(&timer->overflow_work, timer->overflow_period);
+	return timer->overflow_period;
 }
 
 static int mlx5_ptp_settime_real_time(struct mlx5_core_dev *mdev,
@@ -517,6 +516,7 @@ static int mlx5_ptp_adjfine(struct ptp_clock_info *ptp, long scaled_ppm)
 	timer->cycles.mult = mult;
 	mlx5_update_clock_info_page(mdev);
 	write_sequnlock_irqrestore(&clock->lock, flags);
+	ptp_schedule_worker(clock->ptp, timer->overflow_period);
 
 	return 0;
 }
@@ -852,6 +852,7 @@ static const struct ptp_clock_info mlx5_ptp_clock_info = {
 	.settime64	= mlx5_ptp_settime,
 	.enable		= NULL,
 	.verify		= NULL,
+	.do_aux_work	= mlx5_timestamp_overflow,
 };
 
 static int mlx5_query_mtpps_pin_mode(struct mlx5_core_dev *mdev, u8 pin,
@@ -1052,12 +1053,11 @@ static void mlx5_init_overflow_period(struct mlx5_clock *clock)
 	do_div(ns, NSEC_PER_SEC / HZ);
 	timer->overflow_period = ns;
 
-	INIT_DELAYED_WORK(&timer->overflow_work, mlx5_timestamp_overflow);
-	if (timer->overflow_period)
-		schedule_delayed_work(&timer->overflow_work, 0);
-	else
+	if (!timer->overflow_period) {
+		timer->overflow_period = HZ;
 		mlx5_core_warn(mdev,
-			       "invalid overflow period, overflow_work is not scheduled\n");
+			       "invalid overflow period, overflow_work is scheduled once per second\n");
+	}
 
 	if (clock_info)
 		clock_info->overflow_period = timer->overflow_period;
@@ -1172,6 +1172,9 @@ void mlx5_init_clock(struct mlx5_core_dev *mdev)
 
 	MLX5_NB_INIT(&clock->pps_nb, mlx5_pps_event, PPS_EVENT);
 	mlx5_eq_notifier_register(mdev, &clock->pps_nb);
+
+	if (clock->ptp)
+		ptp_schedule_worker(clock->ptp, 0);
 }
 
 void mlx5_cleanup_clock(struct mlx5_core_dev *mdev)
@@ -1188,7 +1191,6 @@ void mlx5_cleanup_clock(struct mlx5_core_dev *mdev)
 	}
 
 	cancel_work_sync(&clock->pps_info.out_work);
-	cancel_delayed_work_sync(&clock->timer.overflow_work);
 
 	if (mdev->clock_info) {
 		free_page((unsigned long)mdev->clock_info);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c
index a897cdc60fdb..b5332c54d4fb 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c
@@ -11,31 +11,29 @@
 /* This is the longest supported action sequence for FDB table:
  * DECAP, POP_VLAN, MODIFY, CTR, ASO, PUSH_VLAN, MODIFY, ENCAP, Term.
  */
-static const u32 action_order_arr[MLX5HWS_TABLE_TYPE_MAX][MLX5HWS_ACTION_TYP_MAX] = {
-	[MLX5HWS_TABLE_TYPE_FDB] = {
-		BIT(MLX5HWS_ACTION_TYP_REMOVE_HEADER) |
-		BIT(MLX5HWS_ACTION_TYP_REFORMAT_TNL_L2_TO_L2) |
-		BIT(MLX5HWS_ACTION_TYP_REFORMAT_TNL_L3_TO_L2),
-		BIT(MLX5HWS_ACTION_TYP_POP_VLAN),
-		BIT(MLX5HWS_ACTION_TYP_POP_VLAN),
-		BIT(MLX5HWS_ACTION_TYP_MODIFY_HDR),
-		BIT(MLX5HWS_ACTION_TYP_PUSH_VLAN),
-		BIT(MLX5HWS_ACTION_TYP_PUSH_VLAN),
-		BIT(MLX5HWS_ACTION_TYP_INSERT_HEADER) |
-		BIT(MLX5HWS_ACTION_TYP_REFORMAT_L2_TO_TNL_L2) |
-		BIT(MLX5HWS_ACTION_TYP_REFORMAT_L2_TO_TNL_L3),
-		BIT(MLX5HWS_ACTION_TYP_CTR),
-		BIT(MLX5HWS_ACTION_TYP_TAG),
-		BIT(MLX5HWS_ACTION_TYP_ASO_METER),
-		BIT(MLX5HWS_ACTION_TYP_MODIFY_HDR),
-		BIT(MLX5HWS_ACTION_TYP_TBL) |
-		BIT(MLX5HWS_ACTION_TYP_VPORT) |
-		BIT(MLX5HWS_ACTION_TYP_DROP) |
-		BIT(MLX5HWS_ACTION_TYP_SAMPLER) |
-		BIT(MLX5HWS_ACTION_TYP_RANGE) |
-		BIT(MLX5HWS_ACTION_TYP_DEST_ARRAY),
-		BIT(MLX5HWS_ACTION_TYP_LAST),
-	},
+static const u32 action_order_arr[MLX5HWS_ACTION_TYP_MAX] = {
+	BIT(MLX5HWS_ACTION_TYP_REMOVE_HEADER) |
+	BIT(MLX5HWS_ACTION_TYP_REFORMAT_TNL_L2_TO_L2) |
+	BIT(MLX5HWS_ACTION_TYP_REFORMAT_TNL_L3_TO_L2),
+	BIT(MLX5HWS_ACTION_TYP_POP_VLAN),
+	BIT(MLX5HWS_ACTION_TYP_POP_VLAN),
+	BIT(MLX5HWS_ACTION_TYP_MODIFY_HDR),
+	BIT(MLX5HWS_ACTION_TYP_PUSH_VLAN),
+	BIT(MLX5HWS_ACTION_TYP_PUSH_VLAN),
+	BIT(MLX5HWS_ACTION_TYP_INSERT_HEADER) |
+	BIT(MLX5HWS_ACTION_TYP_REFORMAT_L2_TO_TNL_L2) |
+	BIT(MLX5HWS_ACTION_TYP_REFORMAT_L2_TO_TNL_L3),
+	BIT(MLX5HWS_ACTION_TYP_CTR),
+	BIT(MLX5HWS_ACTION_TYP_TAG),
+	BIT(MLX5HWS_ACTION_TYP_ASO_METER),
+	BIT(MLX5HWS_ACTION_TYP_MODIFY_HDR),
+	BIT(MLX5HWS_ACTION_TYP_TBL) |
+	BIT(MLX5HWS_ACTION_TYP_VPORT) |
+	BIT(MLX5HWS_ACTION_TYP_DROP) |
+	BIT(MLX5HWS_ACTION_TYP_SAMPLER) |
+	BIT(MLX5HWS_ACTION_TYP_RANGE) |
+	BIT(MLX5HWS_ACTION_TYP_DEST_ARRAY),
+	BIT(MLX5HWS_ACTION_TYP_LAST),
 };
 
 static const char * const mlx5hws_action_type_str[] = {
@@ -83,8 +81,8 @@ static int hws_action_get_shared_stc_nic(struct mlx5hws_context *ctx,
 	int ret;
 
 	mutex_lock(&ctx->ctrl_lock);
-	if (ctx->common_res[tbl_type].shared_stc[stc_type]) {
-		ctx->common_res[tbl_type].shared_stc[stc_type]->refcount++;
+	if (ctx->common_res.shared_stc[stc_type]) {
+		ctx->common_res.shared_stc[stc_type]->refcount++;
 		mutex_unlock(&ctx->ctrl_lock);
 		return 0;
 	}
@@ -124,8 +122,8 @@ static int hws_action_get_shared_stc_nic(struct mlx5hws_context *ctx,
 		goto free_shared_stc;
 	}
 
-	ctx->common_res[tbl_type].shared_stc[stc_type] = shared_stc;
-	ctx->common_res[tbl_type].shared_stc[stc_type]->refcount = 1;
+	ctx->common_res.shared_stc[stc_type] = shared_stc;
+	ctx->common_res.shared_stc[stc_type]->refcount = 1;
 
 	mutex_unlock(&ctx->ctrl_lock);
 
@@ -178,16 +176,16 @@ static void hws_action_put_shared_stc(struct mlx5hws_action *action,
 	}
 
 	mutex_lock(&ctx->ctrl_lock);
-	if (--ctx->common_res[tbl_type].shared_stc[stc_type]->refcount) {
+	if (--ctx->common_res.shared_stc[stc_type]->refcount) {
 		mutex_unlock(&ctx->ctrl_lock);
 		return;
 	}
 
-	shared_stc = ctx->common_res[tbl_type].shared_stc[stc_type];
+	shared_stc = ctx->common_res.shared_stc[stc_type];
 
 	mlx5hws_action_free_single_stc(ctx, tbl_type, &shared_stc->stc_chunk);
 	kfree(shared_stc);
-	ctx->common_res[tbl_type].shared_stc[stc_type] = NULL;
+	ctx->common_res.shared_stc[stc_type] = NULL;
 	mutex_unlock(&ctx->ctrl_lock);
 }
 
@@ -206,10 +204,10 @@ bool mlx5hws_action_check_combo(struct mlx5hws_context *ctx,
 				enum mlx5hws_action_type *user_actions,
 				enum mlx5hws_table_type table_type)
 {
-	const u32 *order_arr = action_order_arr[table_type];
+	const u32 *order_arr = action_order_arr;
+	bool valid_combo;
 	u8 order_idx = 0;
 	u8 user_idx = 0;
-	bool valid_combo;
 
 	if (table_type >= MLX5HWS_TABLE_TYPE_MAX) {
 		mlx5hws_err(ctx, "Invalid table_type %d", table_type);
@@ -321,8 +319,8 @@ int mlx5hws_action_alloc_single_stc(struct mlx5hws_context *ctx,
 __must_hold(&ctx->ctrl_lock)
 {
 	struct mlx5hws_cmd_stc_modify_attr cleanup_stc_attr = {0};
-	struct mlx5hws_pool *stc_pool = ctx->stc_pool[table_type];
 	struct mlx5hws_cmd_stc_modify_attr fixup_stc_attr = {0};
+	struct mlx5hws_pool *stc_pool = ctx->stc_pool;
 	bool use_fixup;
 	u32 obj_0_id;
 	int ret;
@@ -387,8 +385,8 @@ void mlx5hws_action_free_single_stc(struct mlx5hws_context *ctx,
 				    struct mlx5hws_pool_chunk *stc)
 __must_hold(&ctx->ctrl_lock)
 {
-	struct mlx5hws_pool *stc_pool = ctx->stc_pool[table_type];
 	struct mlx5hws_cmd_stc_modify_attr stc_attr = {0};
+	struct mlx5hws_pool *stc_pool = ctx->stc_pool;
 	u32 obj_id;
 
 	/* Modify the STC not to point to an object */
@@ -473,6 +471,7 @@ static void hws_action_fill_stc_attr(struct mlx5hws_action *action,
 		break;
 	case MLX5HWS_ACTION_TYP_TBL:
 	case MLX5HWS_ACTION_TYP_DEST_ARRAY:
+	case MLX5HWS_ACTION_TYP_SAMPLER:
 		attr->action_type = MLX5_IFC_STC_ACTION_TYPE_JUMP_TO_FT;
 		attr->action_offset = MLX5HWS_ACTION_OFFSET_HIT;
 		attr->dest_table_id = obj_id;
@@ -561,7 +560,7 @@ hws_action_create_stcs(struct mlx5hws_action *action, u32 obj_id)
 	if (action->flags & MLX5HWS_ACTION_FLAG_HWS_FDB) {
 		ret = mlx5hws_action_alloc_single_stc(ctx, &stc_attr,
 						      MLX5HWS_TABLE_TYPE_FDB,
-						      &action->stc[MLX5HWS_TABLE_TYPE_FDB]);
+						      &action->stc);
 		if (ret)
 			goto out_err;
 	}
@@ -585,7 +584,7 @@ hws_action_destroy_stcs(struct mlx5hws_action *action)
 
 	if (action->flags & MLX5HWS_ACTION_FLAG_HWS_FDB)
 		mlx5hws_action_free_single_stc(ctx, MLX5HWS_TABLE_TYPE_FDB,
-					       &action->stc[MLX5HWS_TABLE_TYPE_FDB]);
+					       &action->stc);
 
 	mutex_unlock(&ctx->ctrl_lock);
 }
@@ -1639,8 +1638,8 @@ hws_action_create_dest_match_range_table(struct mlx5hws_context *ctx,
 	rtc_attr.table_type = mlx5hws_table_get_res_fw_ft_type(MLX5HWS_TABLE_TYPE_FDB, false);
 
 	/* STC is a single resource (obj_id), use any STC for the ID */
-	stc_pool = ctx->stc_pool[MLX5HWS_TABLE_TYPE_FDB];
-	default_stc = ctx->common_res[MLX5HWS_TABLE_TYPE_FDB].default_stc;
+	stc_pool = ctx->stc_pool;
+	default_stc = ctx->common_res.default_stc;
 	obj_id = mlx5hws_pool_chunk_get_base_id(stc_pool, &default_stc->default_hit);
 	rtc_attr.stc_base = obj_id;
 
@@ -1731,7 +1730,7 @@ hws_action_create_dest_match_range_fill_table(struct mlx5hws_context *ctx,
 	ste_attr.used_id_rtc_0 = &used_rtc_0_id;
 	ste_attr.used_id_rtc_1 = &used_rtc_1_id;
 
-	common_res = &ctx->common_res[MLX5HWS_TABLE_TYPE_FDB];
+	common_res = &ctx->common_res;
 
 	/* init an empty match STE which will always hit */
 	ste_attr.wqe_ctrl = &wqe_ctrl;
@@ -1750,7 +1749,7 @@ hws_action_create_dest_match_range_fill_table(struct mlx5hws_context *ctx,
 	wqe_ctrl.stc_ix[MLX5HWS_ACTION_STC_IDX_CTRL] |=
 		htonl(MLX5HWS_ACTION_STC_IDX_LAST_COMBO2 << 29);
 	wqe_ctrl.stc_ix[MLX5HWS_ACTION_STC_IDX_HIT] =
-		htonl(hit_ft_action->stc[MLX5HWS_TABLE_TYPE_FDB].offset);
+		htonl(hit_ft_action->stc.offset);
 
 	wqe_data_arr = (__force __be32 *)&range_wqe_data;
 
@@ -1843,7 +1842,7 @@ mlx5hws_action_create_dest_match_range(struct mlx5hws_context *ctx,
 	stc_attr.ste_table.match_definer_id = ctx->caps->trivial_match_definer;
 
 	ret = mlx5hws_action_alloc_single_stc(ctx, &stc_attr, MLX5HWS_TABLE_TYPE_FDB,
-					      &action->stc[MLX5HWS_TABLE_TYPE_FDB]);
+					      &action->stc);
 	if (ret)
 		goto error_unlock;
 
@@ -1875,7 +1874,50 @@ struct mlx5hws_action *
 mlx5hws_action_create_flow_sampler(struct mlx5hws_context *ctx,
 				   u32 sampler_id, u32 flags)
 {
-	mlx5hws_err(ctx, "Flow sampler action - unsupported\n");
+	struct mlx5hws_cmd_ft_create_attr ft_attr = {0};
+	struct mlx5hws_cmd_set_fte_attr fte_attr = {0};
+	struct mlx5hws_cmd_forward_tbl *fw_island;
+	struct mlx5hws_cmd_set_fte_dest dest;
+	struct mlx5hws_action *action;
+	int ret;
+
+	if (flags != (MLX5HWS_ACTION_FLAG_HWS_FDB | MLX5HWS_ACTION_FLAG_SHARED)) {
+		mlx5hws_err(ctx, "Unsupported flags for flow sampler\n");
+		return NULL;
+	}
+
+	ft_attr.type = FS_FT_FDB;
+	ft_attr.level = ctx->caps->fdb_ft.max_level - 1;
+
+	dest.destination_type = MLX5_FLOW_DESTINATION_TYPE_FLOW_SAMPLER;
+	dest.destination_id = sampler_id;
+
+	fte_attr.dests_num = 1;
+	fte_attr.dests = &dest;
+	fte_attr.action_flags = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
+	fte_attr.ignore_flow_level = 1;
+
+	fw_island = mlx5hws_cmd_forward_tbl_create(ctx->mdev, &ft_attr, &fte_attr);
+	if (!fw_island)
+		return NULL;
+
+	action = hws_action_create_generic(ctx, flags,
+					   MLX5HWS_ACTION_TYP_SAMPLER);
+	if (!action)
+		goto destroy_fw_island;
+
+	ret = hws_action_create_stcs(action, fw_island->ft_id);
+	if (ret)
+		goto free_action;
+
+	action->flow_sampler.fw_island = fw_island;
+
+	return action;
+
+free_action:
+	kfree(action);
+destroy_fw_island:
+	mlx5hws_cmd_forward_tbl_destroy(ctx->mdev, fw_island);
 	return NULL;
 }
 
@@ -1914,6 +1956,11 @@ static void hws_action_destroy_hws(struct mlx5hws_action *action)
 		}
 		kfree(action->dest_array.dest_list);
 		break;
+	case MLX5HWS_ACTION_TYP_SAMPLER:
+		hws_action_destroy_stcs(action);
+		mlx5hws_cmd_forward_tbl_destroy(action->ctx->mdev,
+						action->flow_sampler.fw_island);
+		break;
 	case MLX5HWS_ACTION_TYP_REFORMAT_TNL_L3_TO_L2:
 	case MLX5HWS_ACTION_TYP_MODIFY_HDR:
 		shared_arg = false;
@@ -1970,8 +2017,8 @@ __must_hold(&ctx->ctrl_lock)
 	struct mlx5hws_action_default_stc *default_stc;
 	int ret;
 
-	if (ctx->common_res[tbl_type].default_stc) {
-		ctx->common_res[tbl_type].default_stc->refcount++;
+	if (ctx->common_res.default_stc) {
+		ctx->common_res.default_stc->refcount++;
 		return 0;
 	}
 
@@ -2023,8 +2070,8 @@ __must_hold(&ctx->ctrl_lock)
 		goto free_nop_dw7;
 	}
 
-	ctx->common_res[tbl_type].default_stc = default_stc;
-	ctx->common_res[tbl_type].default_stc->refcount++;
+	ctx->common_res.default_stc = default_stc;
+	ctx->common_res.default_stc->refcount++;
 
 	return 0;
 
@@ -2046,9 +2093,7 @@ __must_hold(&ctx->ctrl_lock)
 {
 	struct mlx5hws_action_default_stc *default_stc;
 
-	default_stc = ctx->common_res[tbl_type].default_stc;
-
-	default_stc = ctx->common_res[tbl_type].default_stc;
+	default_stc = ctx->common_res.default_stc;
 	if (--default_stc->refcount)
 		return;
 
@@ -2058,7 +2103,7 @@ __must_hold(&ctx->ctrl_lock)
 	mlx5hws_action_free_single_stc(ctx, tbl_type, &default_stc->nop_dw5);
 	mlx5hws_action_free_single_stc(ctx, tbl_type, &default_stc->nop_ctr);
 	kfree(default_stc);
-	ctx->common_res[tbl_type].default_stc = NULL;
+	ctx->common_res.default_stc = NULL;
 }
 
 static void hws_action_modify_write(struct mlx5hws_send_engine *queue,
@@ -2150,8 +2195,7 @@ hws_action_apply_stc(struct mlx5hws_actions_apply_data *apply,
 {
 	struct mlx5hws_action *action = apply->rule_action[action_idx].action;
 
-	apply->wqe_ctrl->stc_ix[stc_idx] =
-		htonl(action->stc[apply->tbl_type].offset);
+	apply->wqe_ctrl->stc_ix[stc_idx] = htonl(action->stc.offset);
 }
 
 static void
@@ -2181,7 +2225,7 @@ hws_action_setter_modify_header(struct mlx5hws_actions_apply_data *apply,
 	rule_action = &apply->rule_action[setter->idx_double];
 	action = rule_action->action;
 
-	stc_idx = htonl(action->stc[apply->tbl_type].offset);
+	stc_idx = htonl(action->stc.offset);
 	apply->wqe_ctrl->stc_ix[MLX5HWS_ACTION_STC_IDX_DW6] = stc_idx;
 	apply->wqe_ctrl->stc_ix[MLX5HWS_ACTION_STC_IDX_DW7] = 0;
 
@@ -2240,7 +2284,7 @@ hws_action_setter_insert_ptr(struct mlx5hws_actions_apply_data *apply,
 	apply->wqe_data[MLX5HWS_ACTION_OFFSET_DW6] = 0;
 	apply->wqe_data[MLX5HWS_ACTION_OFFSET_DW7] = htonl(arg_idx);
 
-	stc_idx = htonl(action->stc[apply->tbl_type].offset);
+	stc_idx = htonl(action->stc.offset);
 	apply->wqe_ctrl->stc_ix[MLX5HWS_ACTION_STC_IDX_DW6] = stc_idx;
 	apply->wqe_ctrl->stc_ix[MLX5HWS_ACTION_STC_IDX_DW7] = 0;
 
@@ -2272,7 +2316,7 @@ hws_action_setter_tnl_l3_to_l2(struct mlx5hws_actions_apply_data *apply,
 	apply->wqe_data[MLX5HWS_ACTION_OFFSET_DW6] = 0;
 	apply->wqe_data[MLX5HWS_ACTION_OFFSET_DW7] = htonl(arg_idx);
 
-	stc_idx = htonl(action->stc[apply->tbl_type].offset);
+	stc_idx = htonl(action->stc.offset);
 	apply->wqe_ctrl->stc_ix[MLX5HWS_ACTION_STC_IDX_DW6] = stc_idx;
 	apply->wqe_ctrl->stc_ix[MLX5HWS_ACTION_STC_IDX_DW7] = 0;
 
@@ -2434,6 +2478,7 @@ int mlx5hws_action_template_process(struct mlx5hws_action_template *at)
 		case MLX5HWS_ACTION_TYP_DROP:
 		case MLX5HWS_ACTION_TYP_TBL:
 		case MLX5HWS_ACTION_TYP_DEST_ARRAY:
+		case MLX5HWS_ACTION_TYP_SAMPLER:
 		case MLX5HWS_ACTION_TYP_VPORT:
 		case MLX5HWS_ACTION_TYP_MISS:
 			/* Hit action */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.h
index e8f562c31826..64b76075f7f8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.h
@@ -70,12 +70,12 @@ struct mlx5hws_action_default_stc {
 	struct mlx5hws_pool_chunk nop_dw6;
 	struct mlx5hws_pool_chunk nop_dw7;
 	struct mlx5hws_pool_chunk default_hit;
-	u32 refcount;
+	u32 refcount; /* protected by context ctrl lock */
 };
 
 struct mlx5hws_action_shared_stc {
 	struct mlx5hws_pool_chunk stc_chunk;
-	u32 refcount;
+	u32 refcount; /* protected by context ctrl lock */
 };
 
 struct mlx5hws_actions_apply_data {
@@ -124,7 +124,7 @@ struct mlx5hws_action {
 	struct mlx5hws_context *ctx;
 	union {
 		struct {
-			struct mlx5hws_pool_chunk stc[MLX5HWS_TABLE_TYPE_MAX];
+			struct mlx5hws_pool_chunk stc;
 			union {
 				struct {
 					u32 pat_id;
@@ -166,6 +166,9 @@ struct mlx5hws_action {
 					struct mlx5hws_cmd_set_fte_dest *dest_list;
 				} dest_array;
 				struct {
+					struct mlx5hws_cmd_forward_tbl *fw_island;
+				} flow_sampler;
+				struct {
 					u8 type;
 					u8 start_anchor;
 					u8 end_anchor;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c
index baacf662c0ab..a8d886e92144 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c
@@ -152,6 +152,8 @@ mlx5hws_bwc_matcher_create(struct mlx5hws_table *table,
 	if (!bwc_matcher)
 		return NULL;
 
+	atomic_set(&bwc_matcher->num_of_rules, 0);
+
 	/* Check if the required match params can be all matched
 	 * in single STE, otherwise complex matcher is needed.
 	 */
@@ -199,10 +201,12 @@ int mlx5hws_bwc_matcher_destroy_simple(struct mlx5hws_bwc_matcher *bwc_matcher)
 
 int mlx5hws_bwc_matcher_destroy(struct mlx5hws_bwc_matcher *bwc_matcher)
 {
-	if (bwc_matcher->num_of_rules)
+	u32 num_of_rules = atomic_read(&bwc_matcher->num_of_rules);
+
+	if (num_of_rules)
 		mlx5hws_err(bwc_matcher->matcher->tbl->ctx,
 			    "BWC matcher destroy: matcher still has %d rules\n",
-			    bwc_matcher->num_of_rules);
+			    num_of_rules);
 
 	mlx5hws_bwc_matcher_destroy_simple(bwc_matcher);
 
@@ -215,6 +219,8 @@ static int hws_bwc_queue_poll(struct mlx5hws_context *ctx,
 			      u32 *pending_rules,
 			      bool drain)
 {
+	unsigned long timeout = jiffies +
+				msecs_to_jiffies(MLX5HWS_BWC_POLLING_TIMEOUT * MSEC_PER_SEC);
 	struct mlx5hws_flow_op_result comp[MLX5HWS_BWC_MATCHER_REHASH_BURST_TH];
 	u16 burst_th = hws_bwc_get_burst_th(ctx, queue_id);
 	bool got_comp = *pending_rules >= burst_th;
@@ -250,6 +256,11 @@ static int hws_bwc_queue_poll(struct mlx5hws_context *ctx,
 		}
 
 		got_comp = !!ret;
+
+		if (unlikely(!got_comp && time_after(jiffies, timeout))) {
+			mlx5hws_err(ctx, "BWC poll error: polling queue %d - TIMEOUT\n", queue_id);
+			return -ETIMEDOUT;
+		}
 	}
 
 	return err;
@@ -309,7 +320,7 @@ static void hws_bwc_rule_list_add(struct mlx5hws_bwc_rule *bwc_rule, u16 idx)
 {
 	struct mlx5hws_bwc_matcher *bwc_matcher = bwc_rule->bwc_matcher;
 
-	bwc_matcher->num_of_rules++;
+	atomic_inc(&bwc_matcher->num_of_rules);
 	bwc_rule->bwc_queue_idx = idx;
 	list_add(&bwc_rule->list_node, &bwc_matcher->rules[idx]);
 }
@@ -318,7 +329,7 @@ static void hws_bwc_rule_list_remove(struct mlx5hws_bwc_rule *bwc_rule)
 {
 	struct mlx5hws_bwc_matcher *bwc_matcher = bwc_rule->bwc_matcher;
 
-	bwc_matcher->num_of_rules--;
+	atomic_dec(&bwc_matcher->num_of_rules);
 	list_del_init(&bwc_rule->list_node);
 }
 
@@ -334,22 +345,21 @@ hws_bwc_rule_destroy_hws_sync(struct mlx5hws_bwc_rule *bwc_rule,
 			      struct mlx5hws_rule_attr *rule_attr)
 {
 	struct mlx5hws_context *ctx = bwc_rule->bwc_matcher->matcher->tbl->ctx;
-	struct mlx5hws_flow_op_result completion;
+	u32 expected_completions = 1;
 	int ret;
 
 	ret = hws_bwc_rule_destroy_hws_async(bwc_rule, rule_attr);
 	if (unlikely(ret))
 		return ret;
 
-	do {
-		ret = mlx5hws_send_queue_poll(ctx, rule_attr->queue_id, &completion, 1);
-	} while (ret != 1);
-
-	if (unlikely(completion.status != MLX5HWS_FLOW_OP_SUCCESS ||
-		     (bwc_rule->rule->status != MLX5HWS_RULE_STATUS_DELETED &&
-		      bwc_rule->rule->status != MLX5HWS_RULE_STATUS_DELETING))) {
-		mlx5hws_err(ctx, "Failed destroying BWC rule: completion %d, rule status %d\n",
-			    completion.status, bwc_rule->rule->status);
+	ret = hws_bwc_queue_poll(ctx, rule_attr->queue_id, &expected_completions, true);
+	if (unlikely(ret))
+		return ret;
+
+	if (unlikely(bwc_rule->rule->status != MLX5HWS_RULE_STATUS_DELETED &&
+		     bwc_rule->rule->status != MLX5HWS_RULE_STATUS_DELETING)) {
+		mlx5hws_err(ctx, "Failed destroying BWC rule: rule status %d\n",
+			    bwc_rule->rule->status);
 		return -EINVAL;
 	}
 
@@ -615,8 +625,12 @@ static int hws_bwc_matcher_move_all_simple(struct mlx5hws_bwc_matcher *bwc_match
 
 				ret = hws_bwc_queue_poll(ctx, rule_attr.queue_id,
 							 &pending_rules[i], false);
-				if (unlikely(ret))
+				if (unlikely(ret)) {
+					mlx5hws_err(ctx,
+						    "Moving BWC rule failed during rehash (%d)\n",
+						    ret);
 					goto free_bwc_rules;
+				}
 			}
 		}
 	} while (!all_done);
@@ -629,8 +643,11 @@ static int hws_bwc_matcher_move_all_simple(struct mlx5hws_bwc_matcher *bwc_match
 			mlx5hws_send_engine_flush_queue(&ctx->send_queue[queue_id]);
 			ret = hws_bwc_queue_poll(ctx, queue_id,
 						 &pending_rules[i], true);
-			if (unlikely(ret))
+			if (unlikely(ret)) {
+				mlx5hws_err(ctx,
+					    "Moving BWC rule failed during rehash (%d)\n", ret);
 				goto free_bwc_rules;
+			}
 		}
 	}
 
@@ -704,7 +721,8 @@ hws_bwc_matcher_rehash_size(struct mlx5hws_bwc_matcher *bwc_matcher)
 	 * Need to check again if we really need rehash.
 	 * If the reason for rehash was size, but not any more - skip rehash.
 	 */
-	if (!hws_bwc_matcher_rehash_size_needed(bwc_matcher, bwc_matcher->num_of_rules))
+	if (!hws_bwc_matcher_rehash_size_needed(bwc_matcher,
+						atomic_read(&bwc_matcher->num_of_rules)))
 		return 0;
 
 	/* Now we're done all the checking - do the rehash:
@@ -797,7 +815,7 @@ int mlx5hws_bwc_rule_create_simple(struct mlx5hws_bwc_rule *bwc_rule,
 	}
 
 	/* check if number of rules require rehash */
-	num_of_rules = bwc_matcher->num_of_rules;
+	num_of_rules = atomic_read(&bwc_matcher->num_of_rules);
 
 	if (unlikely(hws_bwc_matcher_rehash_size_needed(bwc_matcher, num_of_rules))) {
 		mutex_unlock(queue_lock);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.h
index 3d4965213b01..f9f569131dde 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.h
@@ -8,10 +8,18 @@
 #define MLX5HWS_BWC_MATCHER_SIZE_LOG_STEP 1
 #define MLX5HWS_BWC_MATCHER_REHASH_PERCENT_TH 70
 #define MLX5HWS_BWC_MATCHER_REHASH_BURST_TH 32
-#define MLX5HWS_BWC_MATCHER_ATTACH_AT_NUM 255
+
+/* Max number of AT attach operations for the same matcher.
+ * When the limit is reached, next attempt to attach new AT
+ * will result in creation of a new matcher and moving all
+ * the rules to this matcher.
+ */
+#define MLX5HWS_BWC_MATCHER_ATTACH_AT_NUM 8
 
 #define MLX5HWS_BWC_MAX_ACTS 16
 
+#define MLX5HWS_BWC_POLLING_TIMEOUT 60
+
 struct mlx5hws_bwc_matcher {
 	struct mlx5hws_matcher *matcher;
 	struct mlx5hws_match_template *mt;
@@ -19,7 +27,7 @@ struct mlx5hws_bwc_matcher {
 	u8 num_of_at;
 	u16 priority;
 	u8 size_log;
-	u32 num_of_rules; /* atomically accessed */
+	atomic_t num_of_rules;
 	struct list_head *rules;
 };
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.c
index c00c138c3366..487e75476b0a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.c
@@ -257,6 +257,12 @@ int mlx5hws_cmd_set_fte(struct mlx5_core_dev *mdev,
 						 dest->ext_reformat_id);
 				}
 				break;
+			case MLX5_FLOW_DESTINATION_TYPE_FLOW_SAMPLER:
+				MLX5_SET(dest_format, in_dests,
+					 destination_type, ifc_dest_type);
+				MLX5_SET(dest_format, in_dests, destination_id,
+					 dest->destination_id);
+				break;
 			default:
 				ret = -EOPNOTSUPP;
 				goto out;
@@ -359,7 +365,7 @@ void mlx5hws_cmd_set_attr_connect_miss_tbl(struct mlx5hws_context *ctx,
 	ft_attr->type = fw_ft_type;
 	ft_attr->table_miss_action = MLX5_IFC_MODIFY_FLOW_TABLE_MISS_ACTION_GOTO_TBL;
 
-	default_miss_tbl = ctx->common_res[type].default_miss->ft_id;
+	default_miss_tbl = ctx->common_res.default_miss->ft_id;
 	if (!default_miss_tbl) {
 		pr_warn("HWS: no flow table ID for default miss\n");
 		return;
@@ -622,12 +628,12 @@ int mlx5hws_cmd_arg_create(struct mlx5_core_dev *mdev,
 			   u32 pd,
 			   u32 *arg_id)
 {
+	u32 in[MLX5_ST_SZ_DW(create_modify_header_arg_in)] = {0};
 	u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {0};
-	u32 in[MLX5_ST_SZ_DW(create_arg_in)] = {0};
 	void *attr;
 	int ret;
 
-	attr = MLX5_ADDR_OF(create_arg_in, in, hdr);
+	attr = MLX5_ADDR_OF(create_modify_header_arg_in, in, hdr);
 	MLX5_SET(general_obj_in_cmd_hdr,
 		 attr, opcode, MLX5_CMD_OP_CREATE_GENERAL_OBJECT);
 	MLX5_SET(general_obj_in_cmd_hdr,
@@ -635,8 +641,8 @@ int mlx5hws_cmd_arg_create(struct mlx5_core_dev *mdev,
 	MLX5_SET(general_obj_in_cmd_hdr,
 		 attr, op_param.create.log_obj_range, log_obj_range);
 
-	attr = MLX5_ADDR_OF(create_arg_in, in, arg);
-	MLX5_SET(arg, attr, access_pd, pd);
+	attr = MLX5_ADDR_OF(create_modify_header_arg_in, in, arg);
+	MLX5_SET(modify_header_arg, attr, access_pd, pd);
 
 	ret = mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
 	if (ret) {
@@ -812,7 +818,7 @@ int mlx5hws_cmd_packet_reformat_create(struct mlx5_core_dev *mdev,
 				       struct mlx5hws_cmd_packet_reformat_create_attr *attr,
 				       u32 *reformat_id)
 {
-	u32 out[MLX5_ST_SZ_DW(alloc_packet_reformat_out)] = {0};
+	u32 out[MLX5_ST_SZ_DW(alloc_packet_reformat_context_out)] = {0};
 	size_t insz, cmd_data_sz, cmd_total_sz;
 	void *prctx;
 	void *pdata;
@@ -845,7 +851,7 @@ int mlx5hws_cmd_packet_reformat_create(struct mlx5_core_dev *mdev,
 		goto out;
 	}
 
-	*reformat_id = MLX5_GET(alloc_packet_reformat_out, out, packet_reformat_id);
+	*reformat_id = MLX5_GET(alloc_packet_reformat_context_out, out, packet_reformat_id);
 out:
 	kfree(in);
 	return ret;
@@ -854,13 +860,13 @@ out:
 int mlx5hws_cmd_packet_reformat_destroy(struct mlx5_core_dev *mdev,
 					u32 reformat_id)
 {
-	u32 out[MLX5_ST_SZ_DW(dealloc_packet_reformat_out)] = {0};
-	u32 in[MLX5_ST_SZ_DW(dealloc_packet_reformat_in)] = {0};
+	u32 out[MLX5_ST_SZ_DW(dealloc_packet_reformat_context_out)] = {0};
+	u32 in[MLX5_ST_SZ_DW(dealloc_packet_reformat_context_in)] = {0};
 	int ret;
 
-	MLX5_SET(dealloc_packet_reformat_in, in, opcode,
+	MLX5_SET(dealloc_packet_reformat_context_in, in, opcode,
 		 MLX5_CMD_OP_DEALLOC_PACKET_REFORMAT_CONTEXT);
-	MLX5_SET(dealloc_packet_reformat_in, in,
+	MLX5_SET(dealloc_packet_reformat_context_in, in,
 		 packet_reformat_id, reformat_id);
 
 	ret = mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
@@ -889,73 +895,6 @@ int mlx5hws_cmd_sq_modify_rdy(struct mlx5_core_dev *mdev, u32 sqn)
 	return ret;
 }
 
-int mlx5hws_cmd_allow_other_vhca_access(struct mlx5_core_dev *mdev,
-					struct mlx5hws_cmd_allow_other_vhca_access_attr *attr)
-{
-	u32 out[MLX5_ST_SZ_DW(allow_other_vhca_access_out)] = {0};
-	u32 in[MLX5_ST_SZ_DW(allow_other_vhca_access_in)] = {0};
-	void *key;
-	int ret;
-
-	MLX5_SET(allow_other_vhca_access_in,
-		 in, opcode, MLX5_CMD_OP_ALLOW_OTHER_VHCA_ACCESS);
-	MLX5_SET(allow_other_vhca_access_in,
-		 in, object_type_to_be_accessed, attr->obj_type);
-	MLX5_SET(allow_other_vhca_access_in,
-		 in, object_id_to_be_accessed, attr->obj_id);
-
-	key = MLX5_ADDR_OF(allow_other_vhca_access_in, in, access_key);
-	memcpy(key, attr->access_key, sizeof(attr->access_key));
-
-	ret = mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
-	if (ret)
-		mlx5_core_err(mdev, "Failed to execute ALLOW_OTHER_VHCA_ACCESS command\n");
-
-	return ret;
-}
-
-int mlx5hws_cmd_alias_obj_create(struct mlx5_core_dev *mdev,
-				 struct mlx5hws_cmd_alias_obj_create_attr *alias_attr,
-				 u32 *obj_id)
-{
-	u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {0};
-	u32 in[MLX5_ST_SZ_DW(create_alias_obj_in)] = {0};
-	void *attr;
-	void *key;
-	int ret;
-
-	attr = MLX5_ADDR_OF(create_alias_obj_in, in, hdr);
-	MLX5_SET(general_obj_in_cmd_hdr,
-		 attr, opcode, MLX5_CMD_OP_CREATE_GENERAL_OBJECT);
-	MLX5_SET(general_obj_in_cmd_hdr,
-		 attr, obj_type, alias_attr->obj_type);
-	MLX5_SET(general_obj_in_cmd_hdr, attr, op_param.create.alias_object, 1);
-
-	attr = MLX5_ADDR_OF(create_alias_obj_in, in, alias_ctx);
-	MLX5_SET(alias_context, attr, vhca_id_to_be_accessed, alias_attr->vhca_id);
-	MLX5_SET(alias_context, attr, object_id_to_be_accessed, alias_attr->obj_id);
-
-	key = MLX5_ADDR_OF(alias_context, attr, access_key);
-	memcpy(key, alias_attr->access_key, sizeof(alias_attr->access_key));
-
-	ret = mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
-	if (ret) {
-		mlx5_core_err(mdev, "Failed to create ALIAS OBJ\n");
-		goto out;
-	}
-
-	*obj_id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id);
-out:
-	return ret;
-}
-
-int mlx5hws_cmd_alias_obj_destroy(struct mlx5_core_dev *mdev,
-				  u16 obj_type,
-				  u32 obj_id)
-{
-	return hws_cmd_general_obj_destroy(mdev, obj_type, obj_id);
-}
-
 int mlx5hws_cmd_generate_wqe(struct mlx5_core_dev *mdev,
 			     struct mlx5hws_cmd_generate_wqe_attr *attr,
 			     struct mlx5_cqe64 *ret_cqe)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.h
index 434f62b0904e..610c63d81ad9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.h
@@ -63,7 +63,7 @@ struct mlx5hws_cmd_forward_tbl {
 	u8 type;
 	u32 ft_id;
 	u32 fg_id;
-	u32 refcount;
+	u32 refcount; /* protected by context ctrl lock */
 };
 
 struct mlx5hws_cmd_rtc_create_attr {
@@ -334,14 +334,6 @@ mlx5hws_cmd_forward_tbl_create(struct mlx5_core_dev *mdev,
 void mlx5hws_cmd_forward_tbl_destroy(struct mlx5_core_dev *mdev,
 				     struct mlx5hws_cmd_forward_tbl *tbl);
 
-int mlx5hws_cmd_alias_obj_create(struct mlx5_core_dev *mdev,
-				 struct mlx5hws_cmd_alias_obj_create_attr *alias_attr,
-				 u32 *obj_id);
-
-int mlx5hws_cmd_alias_obj_destroy(struct mlx5_core_dev *mdev,
-				  u16 obj_type,
-				  u32 obj_id);
-
 int mlx5hws_cmd_sq_modify_rdy(struct mlx5_core_dev *mdev, u32 sqn);
 
 int mlx5hws_cmd_query_caps(struct mlx5_core_dev *mdev,
@@ -352,9 +344,6 @@ void mlx5hws_cmd_set_attr_connect_miss_tbl(struct mlx5hws_context *ctx,
 					   enum mlx5hws_table_type type,
 					   struct mlx5hws_cmd_ft_modify_attr *ft_attr);
 
-int mlx5hws_cmd_allow_other_vhca_access(struct mlx5_core_dev *mdev,
-					struct mlx5hws_cmd_allow_other_vhca_access_attr *attr);
-
 int mlx5hws_cmd_query_gvmi(struct mlx5_core_dev *mdev, bool other_function,
 			   u16 vport_number, u16 *gvmi);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.c
index 4a8928f33bb9..9cda2774fd64 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.c
@@ -23,7 +23,6 @@ static int hws_context_pools_init(struct mlx5hws_context *ctx)
 	struct mlx5hws_pool_attr pool_attr = {0};
 	u8 max_log_sz;
 	int ret;
-	int i;
 
 	ret = mlx5hws_pat_init_pattern_cache(&ctx->pattern_cache);
 	if (ret)
@@ -39,23 +38,17 @@ static int hws_context_pools_init(struct mlx5hws_context *ctx)
 	max_log_sz = min(MLX5HWS_POOL_STC_LOG_SZ, ctx->caps->stc_alloc_log_max);
 	pool_attr.alloc_log_sz = max(max_log_sz, ctx->caps->stc_alloc_log_gran);
 
-	for (i = 0; i < MLX5HWS_TABLE_TYPE_MAX; i++) {
-		pool_attr.table_type = i;
-		ctx->stc_pool[i] = mlx5hws_pool_create(ctx, &pool_attr);
-		if (!ctx->stc_pool[i]) {
-			mlx5hws_err(ctx, "Failed to allocate STC pool [%d]", i);
-			ret = -ENOMEM;
-			goto free_stc_pools;
-		}
+	pool_attr.table_type = MLX5HWS_TABLE_TYPE_FDB;
+	ctx->stc_pool = mlx5hws_pool_create(ctx, &pool_attr);
+	if (!ctx->stc_pool) {
+		mlx5hws_err(ctx, "Failed to allocate STC pool\n");
+		ret = -ENOMEM;
+		goto uninit_cache;
 	}
 
 	return 0;
 
-free_stc_pools:
-	for (i = 0; i < MLX5HWS_TABLE_TYPE_MAX; i++)
-		if (ctx->stc_pool[i])
-			mlx5hws_pool_destroy(ctx->stc_pool[i]);
-
+uninit_cache:
 	mlx5hws_definer_uninit_cache(ctx->definer_cache);
 uninit_pat_cache:
 	mlx5hws_pat_uninit_pattern_cache(ctx->pattern_cache);
@@ -64,12 +57,8 @@ uninit_pat_cache:
 
 static void hws_context_pools_uninit(struct mlx5hws_context *ctx)
 {
-	int i;
-
-	for (i = 0; i < MLX5HWS_TABLE_TYPE_MAX; i++) {
-		if (ctx->stc_pool[i])
-			mlx5hws_pool_destroy(ctx->stc_pool[i]);
-	}
+	if (ctx->stc_pool)
+		mlx5hws_pool_destroy(ctx->stc_pool);
 
 	mlx5hws_definer_uninit_cache(ctx->definer_cache);
 	mlx5hws_pat_uninit_pattern_cache(ctx->pattern_cache);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.h
index 1c9cc4fba083..38c3647444ad 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.h
@@ -38,8 +38,8 @@ struct mlx5hws_context {
 	struct mlx5_core_dev *mdev;
 	struct mlx5hws_cmd_query_caps *caps;
 	u32 pd_num;
-	struct mlx5hws_pool *stc_pool[MLX5HWS_TABLE_TYPE_MAX];
-	struct mlx5hws_context_common_res common_res[MLX5HWS_TABLE_TYPE_MAX];
+	struct mlx5hws_pool *stc_pool;
+	struct mlx5hws_context_common_res common_res;
 	struct mlx5hws_pattern_cache *pattern_cache;
 	struct mlx5hws_definer_cache *definer_cache;
 	struct mutex ctrl_lock; /* control lock to protect the whole context */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/debug.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/debug.c
index 5b200b4bc1a8..696275fd0ce2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/debug.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/debug.c
@@ -148,8 +148,8 @@ static int hws_debug_dump_matcher(struct seq_file *f, struct mlx5hws_matcher *ma
 		   matcher->match_ste.rtc_1_id,
 		   (int)ste_1_id);
 
-	ste = &matcher->action_ste[0].ste;
-	ste_pool = matcher->action_ste[0].pool;
+	ste = &matcher->action_ste.ste;
+	ste_pool = matcher->action_ste.pool;
 	if (ste_pool) {
 		ste_0_id = mlx5hws_pool_chunk_get_base_id(ste_pool, ste);
 		if (tbl_type == MLX5HWS_TABLE_TYPE_FDB)
@@ -171,10 +171,8 @@ static int hws_debug_dump_matcher(struct seq_file *f, struct mlx5hws_matcher *ma
 		return ret;
 
 	seq_printf(f, ",%d,%d,%d,%d,%d,0x%llx,0x%llx\n",
-		   matcher->action_ste[0].rtc_0_id,
-		   (int)ste_0_id,
-		   matcher->action_ste[0].rtc_1_id,
-		   (int)ste_1_id,
+		   matcher->action_ste.rtc_0_id, (int)ste_0_id,
+		   matcher->action_ste.rtc_1_id, (int)ste_1_id,
 		   0,
 		   mlx5hws_debug_icm_to_idx(icm_addr_0),
 		   mlx5hws_debug_icm_to_idx(icm_addr_1));
@@ -368,9 +366,10 @@ static int hws_debug_dump_context_info(struct seq_file *f, struct mlx5hws_contex
 
 static int hws_debug_dump_context_stc_resource(struct seq_file *f,
 					       struct mlx5hws_context *ctx,
-					       u32 tbl_type,
 					       struct mlx5hws_pool_resource *resource)
 {
+	u32 tbl_type = MLX5HWS_TABLE_TYPE_BASE + MLX5HWS_TABLE_TYPE_FDB;
+
 	seq_printf(f, "%d,0x%llx,%u,%u\n",
 		   MLX5HWS_DEBUG_RES_TYPE_CONTEXT_STC,
 		   HWS_PTR_TO_ID(ctx),
@@ -382,31 +381,22 @@ static int hws_debug_dump_context_stc_resource(struct seq_file *f,
 
 static int hws_debug_dump_context_stc(struct seq_file *f, struct mlx5hws_context *ctx)
 {
-	struct mlx5hws_pool *stc_pool;
-	u32 table_type;
+	struct mlx5hws_pool *stc_pool = ctx->stc_pool;
 	int ret;
-	int i;
 
-	for (i = 0; i < MLX5HWS_TABLE_TYPE_MAX; i++) {
-		stc_pool = ctx->stc_pool[i];
-		table_type = MLX5HWS_TABLE_TYPE_BASE + i;
-
-		if (!stc_pool)
-			continue;
+	if (!stc_pool)
+		return 0;
 
-		if (stc_pool->resource[0]) {
-			ret = hws_debug_dump_context_stc_resource(f, ctx, table_type,
-								  stc_pool->resource[0]);
-			if (ret)
-				return ret;
-		}
+	if (stc_pool->resource[0]) {
+		ret = hws_debug_dump_context_stc_resource(f, ctx, stc_pool->resource[0]);
+		if (ret)
+			return ret;
+	}
 
-		if (i == MLX5HWS_TABLE_TYPE_FDB && stc_pool->mirror_resource[0]) {
-			ret = hws_debug_dump_context_stc_resource(f, ctx, table_type,
-								  stc_pool->mirror_resource[0]);
-			if (ret)
-				return ret;
-		}
+	if (stc_pool->mirror_resource[0]) {
+		ret = hws_debug_dump_context_stc_resource(f, ctx, stc_pool->mirror_resource[0]);
+		if (ret)
+			return ret;
 	}
 
 	return 0;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/definer.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/definer.c
index 8fe96eb76baf..10ece7df1cfa 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/definer.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/definer.c
@@ -70,7 +70,7 @@
 			u32 second_dw_mask = (mask) & ((1 << _bit_off) - 1); \
 			_HWS_SET32(p, (v) >> _bit_off, byte_off, 0, (mask) >> _bit_off); \
 			_HWS_SET32(p, (v) & second_dw_mask, (byte_off) + DW_SIZE, \
-				    (bit_off) % BITS_IN_DW, second_dw_mask); \
+				    (bit_off + BITS_IN_DW) % BITS_IN_DW, second_dw_mask); \
 		} else { \
 			_HWS_SET32(p, v, byte_off, (bit_off), (mask)); \
 		} \
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/definer.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/definer.h
index 9432d5084def..5c1a2086efba 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/definer.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/definer.h
@@ -785,7 +785,7 @@ struct mlx5hws_definer_cache {
 
 struct mlx5hws_definer_cache_item {
 	struct mlx5hws_definer definer;
-	u32 refcount;
+	u32 refcount; /* protected by context ctrl lock */
 	struct list_head list_node;
 };
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws.c
new file mode 100644
index 000000000000..05329afeb9ea
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws.c
@@ -0,0 +1,1377 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/* Copyright (c) 2025 NVIDIA Corporation & Affiliates */
+
+#include <linux/mlx5/vport.h>
+#include <mlx5_core.h>
+#include <fs_core.h>
+#include <fs_cmd.h>
+#include "fs_hws_pools.h"
+#include "mlx5hws.h"
+
+#define MLX5HWS_CTX_MAX_NUM_OF_QUEUES 16
+#define MLX5HWS_CTX_QUEUE_SIZE 256
+
+static struct mlx5hws_action *
+mlx5_fs_create_action_remove_header_vlan(struct mlx5hws_context *ctx);
+static void
+mlx5_fs_destroy_pr_pool(struct mlx5_fs_pool *pool, struct xarray *pr_pools,
+			unsigned long index);
+static void
+mlx5_fs_destroy_mh_pool(struct mlx5_fs_pool *pool, struct xarray *mh_pools,
+			unsigned long index);
+
+static int mlx5_fs_init_hws_actions_pool(struct mlx5_core_dev *dev,
+					 struct mlx5_fs_hws_context *fs_ctx)
+{
+	u32 flags = MLX5HWS_ACTION_FLAG_HWS_FDB | MLX5HWS_ACTION_FLAG_SHARED;
+	struct mlx5_fs_hws_actions_pool *hws_pool = &fs_ctx->hws_pool;
+	struct mlx5hws_action_reformat_header reformat_hdr = {};
+	struct mlx5hws_context *ctx = fs_ctx->hws_ctx;
+	enum mlx5hws_action_type action_type;
+	int err = -ENOSPC;
+
+	hws_pool->tag_action = mlx5hws_action_create_tag(ctx, flags);
+	if (!hws_pool->tag_action)
+		return err;
+	hws_pool->pop_vlan_action = mlx5hws_action_create_pop_vlan(ctx, flags);
+	if (!hws_pool->pop_vlan_action)
+		goto destroy_tag;
+	hws_pool->push_vlan_action = mlx5hws_action_create_push_vlan(ctx, flags);
+	if (!hws_pool->push_vlan_action)
+		goto destroy_pop_vlan;
+	hws_pool->drop_action = mlx5hws_action_create_dest_drop(ctx, flags);
+	if (!hws_pool->drop_action)
+		goto destroy_push_vlan;
+	action_type = MLX5HWS_ACTION_TYP_REFORMAT_TNL_L2_TO_L2;
+	hws_pool->decapl2_action =
+		mlx5hws_action_create_reformat(ctx, action_type, 1,
+					       &reformat_hdr, 0, flags);
+	if (!hws_pool->decapl2_action)
+		goto destroy_drop;
+	hws_pool->remove_hdr_vlan_action =
+		mlx5_fs_create_action_remove_header_vlan(ctx);
+	if (!hws_pool->remove_hdr_vlan_action)
+		goto destroy_decapl2;
+	err = mlx5_fs_hws_pr_pool_init(&hws_pool->insert_hdr_pool, dev, 0,
+				       MLX5HWS_ACTION_TYP_INSERT_HEADER);
+	if (err)
+		goto destroy_remove_hdr;
+	err = mlx5_fs_hws_pr_pool_init(&hws_pool->dl3tnltol2_pool, dev, 0,
+				       MLX5HWS_ACTION_TYP_REFORMAT_TNL_L3_TO_L2);
+	if (err)
+		goto cleanup_insert_hdr;
+	xa_init(&hws_pool->el2tol3tnl_pools);
+	xa_init(&hws_pool->el2tol2tnl_pools);
+	xa_init(&hws_pool->mh_pools);
+	xa_init(&hws_pool->table_dests);
+	xa_init(&hws_pool->vport_dests);
+	xa_init(&hws_pool->vport_vhca_dests);
+	return 0;
+
+cleanup_insert_hdr:
+	mlx5_fs_hws_pr_pool_cleanup(&hws_pool->insert_hdr_pool);
+destroy_remove_hdr:
+	mlx5hws_action_destroy(hws_pool->remove_hdr_vlan_action);
+destroy_decapl2:
+	mlx5hws_action_destroy(hws_pool->decapl2_action);
+destroy_drop:
+	mlx5hws_action_destroy(hws_pool->drop_action);
+destroy_push_vlan:
+	mlx5hws_action_destroy(hws_pool->push_vlan_action);
+destroy_pop_vlan:
+	mlx5hws_action_destroy(hws_pool->pop_vlan_action);
+destroy_tag:
+	mlx5hws_action_destroy(hws_pool->tag_action);
+	return err;
+}
+
+static void mlx5_fs_cleanup_hws_actions_pool(struct mlx5_fs_hws_context *fs_ctx)
+{
+	struct mlx5_fs_hws_actions_pool *hws_pool = &fs_ctx->hws_pool;
+	struct mlx5hws_action *action;
+	struct mlx5_fs_pool *pool;
+	unsigned long i;
+
+	xa_for_each(&hws_pool->vport_vhca_dests, i, action)
+		mlx5hws_action_destroy(action);
+	xa_destroy(&hws_pool->vport_vhca_dests);
+	xa_for_each(&hws_pool->vport_dests, i, action)
+		mlx5hws_action_destroy(action);
+	xa_destroy(&hws_pool->vport_dests);
+	xa_destroy(&hws_pool->table_dests);
+	xa_for_each(&hws_pool->mh_pools, i, pool)
+		mlx5_fs_destroy_mh_pool(pool, &hws_pool->mh_pools, i);
+	xa_destroy(&hws_pool->mh_pools);
+	xa_for_each(&hws_pool->el2tol2tnl_pools, i, pool)
+		mlx5_fs_destroy_pr_pool(pool, &hws_pool->el2tol2tnl_pools, i);
+	xa_destroy(&hws_pool->el2tol2tnl_pools);
+	xa_for_each(&hws_pool->el2tol3tnl_pools, i, pool)
+		mlx5_fs_destroy_pr_pool(pool, &hws_pool->el2tol3tnl_pools, i);
+	xa_destroy(&hws_pool->el2tol3tnl_pools);
+	mlx5_fs_hws_pr_pool_cleanup(&hws_pool->dl3tnltol2_pool);
+	mlx5_fs_hws_pr_pool_cleanup(&hws_pool->insert_hdr_pool);
+	mlx5hws_action_destroy(hws_pool->remove_hdr_vlan_action);
+	mlx5hws_action_destroy(hws_pool->decapl2_action);
+	mlx5hws_action_destroy(hws_pool->drop_action);
+	mlx5hws_action_destroy(hws_pool->push_vlan_action);
+	mlx5hws_action_destroy(hws_pool->pop_vlan_action);
+	mlx5hws_action_destroy(hws_pool->tag_action);
+}
+
+static int mlx5_cmd_hws_create_ns(struct mlx5_flow_root_namespace *ns)
+{
+	struct mlx5hws_context_attr hws_ctx_attr = {};
+	int err;
+
+	hws_ctx_attr.queues = min_t(int, num_online_cpus(),
+				    MLX5HWS_CTX_MAX_NUM_OF_QUEUES);
+	hws_ctx_attr.queue_size = MLX5HWS_CTX_QUEUE_SIZE;
+
+	ns->fs_hws_context.hws_ctx =
+		mlx5hws_context_open(ns->dev, &hws_ctx_attr);
+	if (!ns->fs_hws_context.hws_ctx) {
+		mlx5_core_err(ns->dev, "Failed to create hws flow namespace\n");
+		return -EINVAL;
+	}
+	err = mlx5_fs_init_hws_actions_pool(ns->dev, &ns->fs_hws_context);
+	if (err) {
+		mlx5_core_err(ns->dev, "Failed to init hws actions pool\n");
+		mlx5hws_context_close(ns->fs_hws_context.hws_ctx);
+		return err;
+	}
+	return 0;
+}
+
+static int mlx5_cmd_hws_destroy_ns(struct mlx5_flow_root_namespace *ns)
+{
+	mlx5_fs_cleanup_hws_actions_pool(&ns->fs_hws_context);
+	return mlx5hws_context_close(ns->fs_hws_context.hws_ctx);
+}
+
+static int mlx5_cmd_hws_set_peer(struct mlx5_flow_root_namespace *ns,
+				 struct mlx5_flow_root_namespace *peer_ns,
+				 u16 peer_vhca_id)
+{
+	struct mlx5hws_context *peer_ctx = NULL;
+
+	if (peer_ns)
+		peer_ctx = peer_ns->fs_hws_context.hws_ctx;
+	mlx5hws_context_set_peer(ns->fs_hws_context.hws_ctx, peer_ctx,
+				 peer_vhca_id);
+	return 0;
+}
+
+static int mlx5_fs_set_ft_default_miss(struct mlx5_flow_root_namespace *ns,
+				       struct mlx5_flow_table *ft,
+				       struct mlx5_flow_table *next_ft)
+{
+	struct mlx5hws_table *next_tbl;
+	int err;
+
+	if (!ns->fs_hws_context.hws_ctx)
+		return -EINVAL;
+
+	/* if no change required, return */
+	if (!next_ft && !ft->fs_hws_table.miss_ft_set)
+		return 0;
+
+	next_tbl = next_ft ? next_ft->fs_hws_table.hws_table : NULL;
+	err = mlx5hws_table_set_default_miss(ft->fs_hws_table.hws_table, next_tbl);
+	if (err) {
+		mlx5_core_err(ns->dev, "Failed setting FT default miss (%d)\n", err);
+		return err;
+	}
+	ft->fs_hws_table.miss_ft_set = !!next_tbl;
+	return 0;
+}
+
+static int mlx5_fs_add_flow_table_dest_action(struct mlx5_flow_root_namespace *ns,
+					      struct mlx5_flow_table *ft)
+{
+	u32 flags = MLX5HWS_ACTION_FLAG_HWS_FDB | MLX5HWS_ACTION_FLAG_SHARED;
+	struct mlx5_fs_hws_context *fs_ctx = &ns->fs_hws_context;
+	struct mlx5hws_action *dest_ft_action;
+	struct xarray *dests_xa;
+	int err;
+
+	dest_ft_action = mlx5hws_action_create_dest_table_num(fs_ctx->hws_ctx,
+							      ft->id, flags);
+	if (!dest_ft_action) {
+		mlx5_core_err(ns->dev, "Failed creating dest table action\n");
+		return -ENOMEM;
+	}
+
+	dests_xa = &fs_ctx->hws_pool.table_dests;
+	err = xa_insert(dests_xa, ft->id, dest_ft_action, GFP_KERNEL);
+	if (err)
+		mlx5hws_action_destroy(dest_ft_action);
+	return err;
+}
+
+static int mlx5_fs_del_flow_table_dest_action(struct mlx5_flow_root_namespace *ns,
+					      struct mlx5_flow_table *ft)
+{
+	struct mlx5_fs_hws_context *fs_ctx = &ns->fs_hws_context;
+	struct mlx5hws_action *dest_ft_action;
+	struct xarray *dests_xa;
+	int err;
+
+	dests_xa = &fs_ctx->hws_pool.table_dests;
+	dest_ft_action = xa_erase(dests_xa, ft->id);
+	if (!dest_ft_action) {
+		mlx5_core_err(ns->dev, "Failed to erase dest ft action\n");
+		return -ENOENT;
+	}
+
+	err = mlx5hws_action_destroy(dest_ft_action);
+	if (err)
+		mlx5_core_err(ns->dev, "Failed to destroy dest ft action\n");
+	return err;
+}
+
+static int mlx5_cmd_hws_create_flow_table(struct mlx5_flow_root_namespace *ns,
+					  struct mlx5_flow_table *ft,
+					  struct mlx5_flow_table_attr *ft_attr,
+					  struct mlx5_flow_table *next_ft)
+{
+	struct mlx5hws_context *ctx = ns->fs_hws_context.hws_ctx;
+	struct mlx5hws_table_attr tbl_attr = {};
+	struct mlx5hws_table *tbl;
+	int err;
+
+	if (mlx5_fs_cmd_is_fw_term_table(ft)) {
+		err = mlx5_fs_cmd_get_fw_cmds()->create_flow_table(ns, ft, ft_attr,
+								   next_ft);
+		if (err)
+			return err;
+		err = mlx5_fs_add_flow_table_dest_action(ns, ft);
+		if (err)
+			mlx5_fs_cmd_get_fw_cmds()->destroy_flow_table(ns, ft);
+		return err;
+	}
+
+	if (ns->table_type != FS_FT_FDB) {
+		mlx5_core_err(ns->dev, "Table type %d not supported for HWS\n",
+			      ns->table_type);
+		return -EOPNOTSUPP;
+	}
+
+	tbl_attr.type = MLX5HWS_TABLE_TYPE_FDB;
+	tbl_attr.level = ft_attr->level;
+	tbl = mlx5hws_table_create(ctx, &tbl_attr);
+	if (!tbl) {
+		mlx5_core_err(ns->dev, "Failed creating hws flow_table\n");
+		return -EINVAL;
+	}
+
+	ft->fs_hws_table.hws_table = tbl;
+	ft->id = mlx5hws_table_get_id(tbl);
+
+	if (next_ft) {
+		err = mlx5_fs_set_ft_default_miss(ns, ft, next_ft);
+		if (err)
+			goto destroy_table;
+	}
+
+	ft->max_fte = INT_MAX;
+
+	err = mlx5_fs_add_flow_table_dest_action(ns, ft);
+	if (err)
+		goto clear_ft_miss;
+	return 0;
+
+clear_ft_miss:
+	mlx5_fs_set_ft_default_miss(ns, ft, NULL);
+destroy_table:
+	mlx5hws_table_destroy(tbl);
+	ft->fs_hws_table.hws_table = NULL;
+	return err;
+}
+
+static int mlx5_cmd_hws_destroy_flow_table(struct mlx5_flow_root_namespace *ns,
+					   struct mlx5_flow_table *ft)
+{
+	int err;
+
+	err = mlx5_fs_del_flow_table_dest_action(ns, ft);
+	if (err)
+		mlx5_core_err(ns->dev, "Failed to remove dest action (%d)\n", err);
+
+	if (mlx5_fs_cmd_is_fw_term_table(ft))
+		return mlx5_fs_cmd_get_fw_cmds()->destroy_flow_table(ns, ft);
+
+	err = mlx5_fs_set_ft_default_miss(ns, ft, NULL);
+	if (err)
+		mlx5_core_err(ns->dev, "Failed to disconnect next table (%d)\n", err);
+
+	err = mlx5hws_table_destroy(ft->fs_hws_table.hws_table);
+	if (err)
+		mlx5_core_err(ns->dev, "Failed to destroy flow_table (%d)\n", err);
+
+	return err;
+}
+
+static int mlx5_cmd_hws_modify_flow_table(struct mlx5_flow_root_namespace *ns,
+					  struct mlx5_flow_table *ft,
+					  struct mlx5_flow_table *next_ft)
+{
+	if (mlx5_fs_cmd_is_fw_term_table(ft))
+		return mlx5_fs_cmd_get_fw_cmds()->modify_flow_table(ns, ft, next_ft);
+
+	return mlx5_fs_set_ft_default_miss(ns, ft, next_ft);
+}
+
+static int mlx5_cmd_hws_update_root_ft(struct mlx5_flow_root_namespace *ns,
+				       struct mlx5_flow_table *ft,
+				       u32 underlay_qpn,
+				       bool disconnect)
+{
+	return mlx5_fs_cmd_get_fw_cmds()->update_root_ft(ns, ft, underlay_qpn,
+							 disconnect);
+}
+
+static int mlx5_cmd_hws_create_flow_group(struct mlx5_flow_root_namespace *ns,
+					  struct mlx5_flow_table *ft, u32 *in,
+					  struct mlx5_flow_group *fg)
+{
+	struct mlx5hws_match_parameters mask;
+	struct mlx5hws_bwc_matcher *matcher;
+	u8 match_criteria_enable;
+	u32 priority;
+
+	if (mlx5_fs_cmd_is_fw_term_table(ft))
+		return mlx5_fs_cmd_get_fw_cmds()->create_flow_group(ns, ft, in, fg);
+
+	mask.match_buf = MLX5_ADDR_OF(create_flow_group_in, in, match_criteria);
+	mask.match_sz = sizeof(fg->mask.match_criteria);
+
+	match_criteria_enable = MLX5_GET(create_flow_group_in, in,
+					 match_criteria_enable);
+	priority = MLX5_GET(create_flow_group_in, in, start_flow_index);
+	matcher = mlx5hws_bwc_matcher_create(ft->fs_hws_table.hws_table,
+					     priority, match_criteria_enable,
+					     &mask);
+	if (!matcher) {
+		mlx5_core_err(ns->dev, "Failed creating matcher\n");
+		return -EINVAL;
+	}
+
+	fg->fs_hws_matcher.matcher = matcher;
+	return 0;
+}
+
+static int mlx5_cmd_hws_destroy_flow_group(struct mlx5_flow_root_namespace *ns,
+					   struct mlx5_flow_table *ft,
+					   struct mlx5_flow_group *fg)
+{
+	if (mlx5_fs_cmd_is_fw_term_table(ft))
+		return mlx5_fs_cmd_get_fw_cmds()->destroy_flow_group(ns, ft, fg);
+
+	return mlx5hws_bwc_matcher_destroy(fg->fs_hws_matcher.matcher);
+}
+
+static struct mlx5hws_action *
+mlx5_fs_get_dest_action_ft(struct mlx5_fs_hws_context *fs_ctx,
+			   struct mlx5_flow_rule *dst)
+{
+	return xa_load(&fs_ctx->hws_pool.table_dests, dst->dest_attr.ft->id);
+}
+
+static struct mlx5hws_action *
+mlx5_fs_get_dest_action_table_num(struct mlx5_fs_hws_context *fs_ctx,
+				  struct mlx5_flow_rule *dst)
+{
+	u32 table_num = dst->dest_attr.ft_num;
+
+	return xa_load(&fs_ctx->hws_pool.table_dests, table_num);
+}
+
+static struct mlx5hws_action *
+mlx5_fs_create_dest_action_table_num(struct mlx5_fs_hws_context *fs_ctx,
+				     struct mlx5_flow_rule *dst)
+{
+	u32 flags = MLX5HWS_ACTION_FLAG_HWS_FDB | MLX5HWS_ACTION_FLAG_SHARED;
+	struct mlx5hws_context *ctx = fs_ctx->hws_ctx;
+	u32 table_num = dst->dest_attr.ft_num;
+
+	return mlx5hws_action_create_dest_table_num(ctx, table_num, flags);
+}
+
+static struct mlx5hws_action *
+mlx5_fs_get_dest_action_vport(struct mlx5_fs_hws_context *fs_ctx,
+			      struct mlx5_flow_rule *dst,
+			      bool is_dest_type_uplink)
+{
+	u32 flags = MLX5HWS_ACTION_FLAG_HWS_FDB | MLX5HWS_ACTION_FLAG_SHARED;
+	struct mlx5_flow_destination *dest_attr = &dst->dest_attr;
+	struct mlx5hws_context *ctx = fs_ctx->hws_ctx;
+	struct mlx5hws_action *dest;
+	struct xarray *dests_xa;
+	bool vhca_id_valid;
+	unsigned long idx;
+	u16 vport_num;
+	int err;
+
+	vhca_id_valid = is_dest_type_uplink ||
+			(dest_attr->vport.flags & MLX5_FLOW_DEST_VPORT_VHCA_ID);
+	vport_num = is_dest_type_uplink ? MLX5_VPORT_UPLINK : dest_attr->vport.num;
+	if (vhca_id_valid) {
+		dests_xa = &fs_ctx->hws_pool.vport_vhca_dests;
+		idx = dest_attr->vport.vhca_id << 16 | vport_num;
+	} else {
+		dests_xa = &fs_ctx->hws_pool.vport_dests;
+		idx = vport_num;
+	}
+dest_load:
+	dest = xa_load(dests_xa, idx);
+	if (dest)
+		return dest;
+
+	dest = mlx5hws_action_create_dest_vport(ctx, vport_num,	vhca_id_valid,
+						dest_attr->vport.vhca_id, flags);
+
+	err = xa_insert(dests_xa, idx, dest, GFP_KERNEL);
+	if (err) {
+		mlx5hws_action_destroy(dest);
+		dest = NULL;
+
+		if (err == -EBUSY)
+			/* xarray entry was already stored by another thread */
+			goto dest_load;
+	}
+
+	return dest;
+}
+
+static struct mlx5hws_action *
+mlx5_fs_create_dest_action_range(struct mlx5hws_context *ctx,
+				 struct mlx5_flow_rule *dst)
+{
+	u32 flags = MLX5HWS_ACTION_FLAG_HWS_FDB | MLX5HWS_ACTION_FLAG_SHARED;
+	struct mlx5_flow_destination *dest_attr = &dst->dest_attr;
+
+	return mlx5hws_action_create_dest_match_range(ctx,
+						      dest_attr->range.field,
+						      dest_attr->range.hit_ft,
+						      dest_attr->range.miss_ft,
+						      dest_attr->range.min,
+						      dest_attr->range.max,
+						      flags);
+}
+
+static struct mlx5hws_action *
+mlx5_fs_create_action_dest_array(struct mlx5hws_context *ctx,
+				 struct mlx5hws_action_dest_attr *dests,
+				 u32 num_of_dests, bool ignore_flow_level,
+				 u32 flow_source)
+{
+	u32 flags = MLX5HWS_ACTION_FLAG_HWS_FDB | MLX5HWS_ACTION_FLAG_SHARED;
+
+	return mlx5hws_action_create_dest_array(ctx, num_of_dests, dests,
+						ignore_flow_level,
+						flow_source, flags);
+}
+
+static struct mlx5hws_action *
+mlx5_fs_get_action_push_vlan(struct mlx5_fs_hws_context *fs_ctx)
+{
+	return fs_ctx->hws_pool.push_vlan_action;
+}
+
+static u32 mlx5_fs_calc_vlan_hdr(struct mlx5_fs_vlan *vlan)
+{
+	u16 n_ethtype = vlan->ethtype;
+	u8 prio = vlan->prio;
+	u16 vid = vlan->vid;
+
+	return (u32)n_ethtype << 16 | (u32)(prio) << 12 | (u32)vid;
+}
+
+static struct mlx5hws_action *
+mlx5_fs_get_action_pop_vlan(struct mlx5_fs_hws_context *fs_ctx)
+{
+	return fs_ctx->hws_pool.pop_vlan_action;
+}
+
+static struct mlx5hws_action *
+mlx5_fs_get_action_decap_tnl_l2_to_l2(struct mlx5_fs_hws_context *fs_ctx)
+{
+	return fs_ctx->hws_pool.decapl2_action;
+}
+
+static struct mlx5hws_action *
+mlx5_fs_get_dest_action_drop(struct mlx5_fs_hws_context *fs_ctx)
+{
+	return fs_ctx->hws_pool.drop_action;
+}
+
+static struct mlx5hws_action *
+mlx5_fs_get_action_tag(struct mlx5_fs_hws_context *fs_ctx)
+{
+	return fs_ctx->hws_pool.tag_action;
+}
+
+static struct mlx5hws_action *
+mlx5_fs_create_action_last(struct mlx5hws_context *ctx)
+{
+	u32 flags = MLX5HWS_ACTION_FLAG_HWS_FDB | MLX5HWS_ACTION_FLAG_SHARED;
+
+	return mlx5hws_action_create_last(ctx, flags);
+}
+
+static void mlx5_fs_destroy_fs_action(struct mlx5_fs_hws_rule_action *fs_action)
+{
+	switch (mlx5hws_action_get_type(fs_action->action)) {
+	case MLX5HWS_ACTION_TYP_CTR:
+		mlx5_fc_put_hws_action(fs_action->counter);
+		break;
+	default:
+		mlx5hws_action_destroy(fs_action->action);
+	}
+}
+
+static void
+mlx5_fs_destroy_fs_actions(struct mlx5_fs_hws_rule_action **fs_actions,
+			   int *num_fs_actions)
+{
+	int i;
+
+	/* Free in reverse order to handle action dependencies */
+	for (i = *num_fs_actions - 1; i >= 0; i--)
+		mlx5_fs_destroy_fs_action(*fs_actions + i);
+	*num_fs_actions = 0;
+	kfree(*fs_actions);
+	*fs_actions = NULL;
+}
+
+/* Splits FTE's actions into cached, rule and destination actions.
+ * The cached and destination actions are saved on the fte hws rule.
+ * The rule actions are returned as a parameter, together with their count.
+ * We want to support a rule with 32 destinations, which means we need to
+ * account for 32 destinations plus usually a counter plus one more action
+ * for a multi-destination flow table.
+ * 32 is SW limitation for array size, keep. HWS limitation is 16M STEs per matcher
+ */
+#define MLX5_FLOW_CONTEXT_ACTION_MAX 34
+static int mlx5_fs_fte_get_hws_actions(struct mlx5_flow_root_namespace *ns,
+				       struct mlx5_flow_table *ft,
+				       struct mlx5_flow_group *group,
+				       struct fs_fte *fte,
+				       struct mlx5hws_rule_action **ractions)
+{
+	struct mlx5_flow_act *fte_action = &fte->act_dests.action;
+	struct mlx5_fs_hws_context *fs_ctx = &ns->fs_hws_context;
+	struct mlx5hws_action_dest_attr *dest_actions;
+	struct mlx5hws_context *ctx = fs_ctx->hws_ctx;
+	struct mlx5_fs_hws_rule_action *fs_actions;
+	struct mlx5_core_dev *dev = ns->dev;
+	struct mlx5hws_action *dest_action;
+	struct mlx5hws_action *tmp_action;
+	struct mlx5_fs_hws_pr *pr_data;
+	struct mlx5_fs_hws_mh *mh_data;
+	bool delay_encap_set = false;
+	struct mlx5_flow_rule *dst;
+	int num_dest_actions = 0;
+	int num_fs_actions = 0;
+	int num_actions = 0;
+	int err;
+
+	*ractions = kcalloc(MLX5_FLOW_CONTEXT_ACTION_MAX, sizeof(**ractions),
+			    GFP_KERNEL);
+	if (!*ractions) {
+		err = -ENOMEM;
+		goto out_err;
+	}
+
+	fs_actions = kcalloc(MLX5_FLOW_CONTEXT_ACTION_MAX,
+			     sizeof(*fs_actions), GFP_KERNEL);
+	if (!fs_actions) {
+		err = -ENOMEM;
+		goto free_actions_alloc;
+	}
+
+	dest_actions = kcalloc(MLX5_FLOW_CONTEXT_ACTION_MAX,
+			       sizeof(*dest_actions), GFP_KERNEL);
+	if (!dest_actions) {
+		err = -ENOMEM;
+		goto free_fs_actions_alloc;
+	}
+
+	/* The order of the actions are must to be kept, only the following
+	 * order is supported by HW steering:
+	 * HWS: decap -> remove_hdr -> pop_vlan -> modify header -> push_vlan
+	 *      -> reformat (insert_hdr/encap) -> ctr -> tag -> aso
+	 *      -> drop -> FWD:tbl/vport/sampler/tbl_num/range -> dest_array -> last
+	 */
+	if (fte_action->action & MLX5_FLOW_CONTEXT_ACTION_DECAP) {
+		tmp_action = mlx5_fs_get_action_decap_tnl_l2_to_l2(fs_ctx);
+		if (!tmp_action) {
+			err = -ENOMEM;
+			goto free_dest_actions_alloc;
+		}
+		(*ractions)[num_actions++].action = tmp_action;
+	}
+
+	if (fte_action->action & MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT) {
+		int reformat_type = fte_action->pkt_reformat->reformat_type;
+
+		if (fte_action->pkt_reformat->owner == MLX5_FLOW_RESOURCE_OWNER_FW) {
+			mlx5_core_err(dev, "FW-owned reformat can't be used in HWS rule\n");
+			err = -EINVAL;
+			goto free_actions;
+		}
+
+		if (reformat_type == MLX5_REFORMAT_TYPE_L3_TUNNEL_TO_L2) {
+			pr_data = fte_action->pkt_reformat->fs_hws_action.pr_data;
+			(*ractions)[num_actions].reformat.offset = pr_data->offset;
+			(*ractions)[num_actions].reformat.hdr_idx = pr_data->hdr_idx;
+			(*ractions)[num_actions].reformat.data = pr_data->data;
+			(*ractions)[num_actions++].action =
+				fte_action->pkt_reformat->fs_hws_action.hws_action;
+		} else if (reformat_type == MLX5_REFORMAT_TYPE_REMOVE_HDR) {
+			(*ractions)[num_actions++].action =
+				fte_action->pkt_reformat->fs_hws_action.hws_action;
+		} else {
+			delay_encap_set = true;
+		}
+	}
+
+	if (fte_action->action & MLX5_FLOW_CONTEXT_ACTION_VLAN_POP) {
+		tmp_action = mlx5_fs_get_action_pop_vlan(fs_ctx);
+		if (!tmp_action) {
+			err = -ENOMEM;
+			goto free_actions;
+		}
+		(*ractions)[num_actions++].action = tmp_action;
+	}
+
+	if (fte_action->action & MLX5_FLOW_CONTEXT_ACTION_VLAN_POP_2) {
+		tmp_action = mlx5_fs_get_action_pop_vlan(fs_ctx);
+		if (!tmp_action) {
+			err = -ENOMEM;
+			goto free_actions;
+		}
+		(*ractions)[num_actions++].action = tmp_action;
+	}
+
+	if (fte_action->action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR) {
+		mh_data = fte_action->modify_hdr->fs_hws_action.mh_data;
+		(*ractions)[num_actions].modify_header.offset = mh_data->offset;
+		(*ractions)[num_actions].modify_header.data = mh_data->data;
+		(*ractions)[num_actions++].action =
+			fte_action->modify_hdr->fs_hws_action.hws_action;
+	}
+
+	if (fte_action->action & MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH) {
+		tmp_action = mlx5_fs_get_action_push_vlan(fs_ctx);
+		if (!tmp_action) {
+			err = -ENOMEM;
+			goto free_actions;
+		}
+		(*ractions)[num_actions].push_vlan.vlan_hdr =
+			htonl(mlx5_fs_calc_vlan_hdr(&fte_action->vlan[0]));
+		(*ractions)[num_actions++].action = tmp_action;
+	}
+
+	if (fte_action->action & MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH_2) {
+		tmp_action = mlx5_fs_get_action_push_vlan(fs_ctx);
+		if (!tmp_action) {
+			err = -ENOMEM;
+			goto free_actions;
+		}
+		(*ractions)[num_actions].push_vlan.vlan_hdr =
+			htonl(mlx5_fs_calc_vlan_hdr(&fte_action->vlan[1]));
+		(*ractions)[num_actions++].action = tmp_action;
+	}
+
+	if (delay_encap_set) {
+		pr_data = fte_action->pkt_reformat->fs_hws_action.pr_data;
+		(*ractions)[num_actions].reformat.offset = pr_data->offset;
+		(*ractions)[num_actions].reformat.data = pr_data->data;
+		(*ractions)[num_actions++].action =
+			fte_action->pkt_reformat->fs_hws_action.hws_action;
+	}
+
+	if (fte_action->action & MLX5_FLOW_CONTEXT_ACTION_COUNT) {
+		list_for_each_entry(dst, &fte->node.children, node.list) {
+			struct mlx5_fc *counter;
+
+			if (dst->dest_attr.type !=
+			    MLX5_FLOW_DESTINATION_TYPE_COUNTER)
+				continue;
+
+			if (num_actions == MLX5_FLOW_CONTEXT_ACTION_MAX) {
+				err = -EOPNOTSUPP;
+				goto free_actions;
+			}
+
+			counter = dst->dest_attr.counter;
+			tmp_action = mlx5_fc_get_hws_action(ctx, counter);
+			if (!tmp_action) {
+				err = -EINVAL;
+				goto free_actions;
+			}
+
+			(*ractions)[num_actions].counter.offset =
+				mlx5_fc_id(counter) - mlx5_fc_get_base_id(counter);
+			(*ractions)[num_actions++].action = tmp_action;
+			fs_actions[num_fs_actions].action = tmp_action;
+			fs_actions[num_fs_actions++].counter = counter;
+		}
+	}
+
+	if (fte->act_dests.flow_context.flow_tag) {
+		if (num_actions == MLX5_FLOW_CONTEXT_ACTION_MAX) {
+			err = -EOPNOTSUPP;
+			goto free_actions;
+		}
+		tmp_action = mlx5_fs_get_action_tag(fs_ctx);
+		if (!tmp_action) {
+			err = -ENOMEM;
+			goto free_actions;
+		}
+		(*ractions)[num_actions].tag.value = fte->act_dests.flow_context.flow_tag;
+		(*ractions)[num_actions++].action = tmp_action;
+	}
+
+	if (fte_action->action & MLX5_FLOW_CONTEXT_ACTION_EXECUTE_ASO) {
+		err = -EOPNOTSUPP;
+		goto free_actions;
+	}
+
+	if (fte_action->action & MLX5_FLOW_CONTEXT_ACTION_DROP) {
+		dest_action = mlx5_fs_get_dest_action_drop(fs_ctx);
+		if (!dest_action) {
+			err = -ENOMEM;
+			goto free_actions;
+		}
+		dest_actions[num_dest_actions++].dest = dest_action;
+	}
+
+	if (fte_action->action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST) {
+		list_for_each_entry(dst, &fte->node.children, node.list) {
+			struct mlx5_flow_destination *attr = &dst->dest_attr;
+			bool type_uplink =
+				attr->type == MLX5_FLOW_DESTINATION_TYPE_UPLINK;
+
+			if (num_fs_actions == MLX5_FLOW_CONTEXT_ACTION_MAX ||
+			    num_dest_actions == MLX5_FLOW_CONTEXT_ACTION_MAX) {
+				err = -EOPNOTSUPP;
+				goto free_actions;
+			}
+			if (attr->type == MLX5_FLOW_DESTINATION_TYPE_COUNTER)
+				continue;
+
+			switch (attr->type) {
+			case MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE:
+				dest_action = mlx5_fs_get_dest_action_ft(fs_ctx, dst);
+				break;
+			case MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE_NUM:
+				dest_action = mlx5_fs_get_dest_action_table_num(fs_ctx,
+										dst);
+				if (dest_action)
+					break;
+				dest_action = mlx5_fs_create_dest_action_table_num(fs_ctx,
+										   dst);
+				fs_actions[num_fs_actions++].action = dest_action;
+				break;
+			case MLX5_FLOW_DESTINATION_TYPE_RANGE:
+				dest_action = mlx5_fs_create_dest_action_range(ctx, dst);
+				fs_actions[num_fs_actions++].action = dest_action;
+				break;
+			case MLX5_FLOW_DESTINATION_TYPE_UPLINK:
+			case MLX5_FLOW_DESTINATION_TYPE_VPORT:
+				dest_action = mlx5_fs_get_dest_action_vport(fs_ctx, dst,
+									    type_uplink);
+				break;
+			default:
+				err = -EOPNOTSUPP;
+				goto free_actions;
+			}
+			if (!dest_action) {
+				err = -ENOMEM;
+				goto free_actions;
+			}
+			dest_actions[num_dest_actions++].dest = dest_action;
+		}
+	}
+
+	if (num_dest_actions == 1) {
+		if (num_actions == MLX5_FLOW_CONTEXT_ACTION_MAX) {
+			err = -EOPNOTSUPP;
+			goto free_actions;
+		}
+		(*ractions)[num_actions++].action = dest_actions->dest;
+	} else if (num_dest_actions > 1) {
+		u32 flow_source = fte->act_dests.flow_context.flow_source;
+		bool ignore_flow_level;
+
+		if (num_actions == MLX5_FLOW_CONTEXT_ACTION_MAX ||
+		    num_fs_actions == MLX5_FLOW_CONTEXT_ACTION_MAX) {
+			err = -EOPNOTSUPP;
+			goto free_actions;
+		}
+		ignore_flow_level =
+			!!(fte_action->flags & FLOW_ACT_IGNORE_FLOW_LEVEL);
+		tmp_action = mlx5_fs_create_action_dest_array(ctx, dest_actions,
+							      num_dest_actions,
+							      ignore_flow_level,
+							      flow_source);
+		if (!tmp_action) {
+			err = -EOPNOTSUPP;
+			goto free_actions;
+		}
+		fs_actions[num_fs_actions++].action = tmp_action;
+		(*ractions)[num_actions++].action = tmp_action;
+	}
+
+	if (num_actions == MLX5_FLOW_CONTEXT_ACTION_MAX ||
+	    num_fs_actions == MLX5_FLOW_CONTEXT_ACTION_MAX) {
+		err = -EOPNOTSUPP;
+		goto free_actions;
+	}
+
+	tmp_action = mlx5_fs_create_action_last(ctx);
+	if (!tmp_action) {
+		err = -ENOMEM;
+		goto free_actions;
+	}
+	fs_actions[num_fs_actions++].action = tmp_action;
+	(*ractions)[num_actions++].action = tmp_action;
+
+	kfree(dest_actions);
+
+	/* Actions created specifically for this rule will be destroyed
+	 * once rule is deleted.
+	 */
+	fte->fs_hws_rule.num_fs_actions = num_fs_actions;
+	fte->fs_hws_rule.hws_fs_actions = fs_actions;
+
+	return 0;
+
+free_actions:
+	mlx5_fs_destroy_fs_actions(&fs_actions, &num_fs_actions);
+free_dest_actions_alloc:
+	kfree(dest_actions);
+free_fs_actions_alloc:
+	kfree(fs_actions);
+free_actions_alloc:
+	kfree(*ractions);
+	*ractions = NULL;
+out_err:
+	return err;
+}
+
+static int mlx5_cmd_hws_create_fte(struct mlx5_flow_root_namespace *ns,
+				   struct mlx5_flow_table *ft,
+				   struct mlx5_flow_group *group,
+				   struct fs_fte *fte)
+{
+	struct mlx5hws_match_parameters params;
+	struct mlx5hws_rule_action *ractions;
+	struct mlx5hws_bwc_rule *rule;
+	int err = 0;
+
+	if (mlx5_fs_cmd_is_fw_term_table(ft)) {
+		/* Packet reformat on terminamtion table not supported yet */
+		if (fte->act_dests.action.action &
+		    MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT)
+			return -EOPNOTSUPP;
+		return mlx5_fs_cmd_get_fw_cmds()->create_fte(ns, ft, group, fte);
+	}
+
+	err = mlx5_fs_fte_get_hws_actions(ns, ft, group, fte, &ractions);
+	if (err)
+		goto out_err;
+
+	params.match_sz = sizeof(fte->val);
+	params.match_buf = fte->val;
+
+	rule = mlx5hws_bwc_rule_create(group->fs_hws_matcher.matcher, &params,
+				       fte->act_dests.flow_context.flow_source,
+				       ractions);
+	kfree(ractions);
+	if (!rule) {
+		err = -EINVAL;
+		goto free_actions;
+	}
+
+	fte->fs_hws_rule.bwc_rule = rule;
+	return 0;
+
+free_actions:
+	mlx5_fs_destroy_fs_actions(&fte->fs_hws_rule.hws_fs_actions,
+				   &fte->fs_hws_rule.num_fs_actions);
+out_err:
+	mlx5_core_err(ns->dev, "Failed to create hws rule err(%d)\n", err);
+	return err;
+}
+
+static int mlx5_cmd_hws_delete_fte(struct mlx5_flow_root_namespace *ns,
+				   struct mlx5_flow_table *ft,
+				   struct fs_fte *fte)
+{
+	struct mlx5_fs_hws_rule *rule = &fte->fs_hws_rule;
+	int err;
+
+	if (mlx5_fs_cmd_is_fw_term_table(ft))
+		return mlx5_fs_cmd_get_fw_cmds()->delete_fte(ns, ft, fte);
+
+	err = mlx5hws_bwc_rule_destroy(rule->bwc_rule);
+	rule->bwc_rule = NULL;
+
+	mlx5_fs_destroy_fs_actions(&rule->hws_fs_actions, &rule->num_fs_actions);
+
+	return err;
+}
+
+static int mlx5_cmd_hws_update_fte(struct mlx5_flow_root_namespace *ns,
+				   struct mlx5_flow_table *ft,
+				   struct mlx5_flow_group *group,
+				   int modify_mask,
+				   struct fs_fte *fte)
+{
+	int allowed_mask = BIT(MLX5_SET_FTE_MODIFY_ENABLE_MASK_ACTION) |
+		BIT(MLX5_SET_FTE_MODIFY_ENABLE_MASK_DESTINATION_LIST) |
+		BIT(MLX5_SET_FTE_MODIFY_ENABLE_MASK_FLOW_COUNTERS);
+	struct mlx5_fs_hws_rule_action *saved_hws_fs_actions;
+	struct mlx5hws_rule_action *ractions;
+	int saved_num_fs_actions;
+	int ret;
+
+	if (mlx5_fs_cmd_is_fw_term_table(ft))
+		return mlx5_fs_cmd_get_fw_cmds()->update_fte(ns, ft, group,
+							     modify_mask, fte);
+
+	if ((modify_mask & ~allowed_mask) != 0)
+		return -EINVAL;
+
+	saved_hws_fs_actions = fte->fs_hws_rule.hws_fs_actions;
+	saved_num_fs_actions = fte->fs_hws_rule.num_fs_actions;
+
+	ret = mlx5_fs_fte_get_hws_actions(ns, ft, group, fte, &ractions);
+	if (ret)
+		return ret;
+
+	ret = mlx5hws_bwc_rule_action_update(fte->fs_hws_rule.bwc_rule, ractions);
+	kfree(ractions);
+	if (ret)
+		goto restore_actions;
+
+	mlx5_fs_destroy_fs_actions(&saved_hws_fs_actions, &saved_num_fs_actions);
+	return ret;
+
+restore_actions:
+	mlx5_fs_destroy_fs_actions(&fte->fs_hws_rule.hws_fs_actions,
+				   &fte->fs_hws_rule.num_fs_actions);
+	fte->fs_hws_rule.hws_fs_actions = saved_hws_fs_actions;
+	fte->fs_hws_rule.num_fs_actions = saved_num_fs_actions;
+	return ret;
+}
+
+static struct mlx5hws_action *
+mlx5_fs_create_action_remove_header_vlan(struct mlx5hws_context *ctx)
+{
+	u32 flags = MLX5HWS_ACTION_FLAG_HWS_FDB | MLX5HWS_ACTION_FLAG_SHARED;
+	struct mlx5hws_action_remove_header_attr remove_hdr_vlan = {};
+
+	/* MAC anchor not supported in HWS reformat, use VLAN anchor */
+	remove_hdr_vlan.anchor = MLX5_REFORMAT_CONTEXT_ANCHOR_VLAN_START;
+	remove_hdr_vlan.offset = 0;
+	remove_hdr_vlan.size = sizeof(struct vlan_hdr);
+	return mlx5hws_action_create_remove_header(ctx, &remove_hdr_vlan, flags);
+}
+
+static struct mlx5hws_action *
+mlx5_fs_get_action_remove_header_vlan(struct mlx5_fs_hws_context *fs_ctx,
+				      struct mlx5_pkt_reformat_params *params)
+{
+	if (!params ||
+	    params->param_0 != MLX5_REFORMAT_CONTEXT_ANCHOR_MAC_START ||
+	    params->param_1 != offsetof(struct vlan_ethhdr, h_vlan_proto) ||
+	    params->size != sizeof(struct vlan_hdr))
+		return NULL;
+
+	return fs_ctx->hws_pool.remove_hdr_vlan_action;
+}
+
+static int
+mlx5_fs_verify_insert_header_params(struct mlx5_core_dev *mdev,
+				    struct mlx5_pkt_reformat_params *params)
+{
+	if ((!params->data && params->size) || (params->data && !params->size) ||
+	    MLX5_CAP_GEN_2(mdev, max_reformat_insert_size) < params->size ||
+	    MLX5_CAP_GEN_2(mdev, max_reformat_insert_offset) < params->param_1) {
+		mlx5_core_err(mdev, "Invalid reformat params for INSERT_HDR\n");
+		return -EINVAL;
+	}
+	if (params->param_0 != MLX5_FS_INSERT_HDR_VLAN_ANCHOR ||
+	    params->param_1 != MLX5_FS_INSERT_HDR_VLAN_OFFSET ||
+	    params->size != MLX5_FS_INSERT_HDR_VLAN_SIZE) {
+		mlx5_core_err(mdev, "Only vlan insert header supported\n");
+		return -EOPNOTSUPP;
+	}
+	return 0;
+}
+
+static int
+mlx5_fs_verify_encap_decap_params(struct mlx5_core_dev *dev,
+				  struct mlx5_pkt_reformat_params *params)
+{
+	if (params->param_0 || params->param_1) {
+		mlx5_core_err(dev, "Invalid reformat params\n");
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static struct mlx5_fs_pool *
+mlx5_fs_get_pr_encap_pool(struct mlx5_core_dev *dev, struct xarray *pr_pools,
+			  enum mlx5hws_action_type reformat_type, size_t size)
+{
+	struct mlx5_fs_pool *pr_pool;
+	unsigned long index = size;
+	int err;
+
+	pr_pool = xa_load(pr_pools, index);
+	if (pr_pool)
+		return pr_pool;
+
+	pr_pool = kzalloc(sizeof(*pr_pool), GFP_KERNEL);
+	if (!pr_pool)
+		return ERR_PTR(-ENOMEM);
+	err = mlx5_fs_hws_pr_pool_init(pr_pool, dev, size, reformat_type);
+	if (err)
+		goto free_pr_pool;
+	err = xa_insert(pr_pools, index, pr_pool, GFP_KERNEL);
+	if (err)
+		goto cleanup_pr_pool;
+	return pr_pool;
+
+cleanup_pr_pool:
+	mlx5_fs_hws_pr_pool_cleanup(pr_pool);
+free_pr_pool:
+	kfree(pr_pool);
+	return ERR_PTR(err);
+}
+
+static void
+mlx5_fs_destroy_pr_pool(struct mlx5_fs_pool *pool, struct xarray *pr_pools,
+			unsigned long index)
+{
+	xa_erase(pr_pools, index);
+	mlx5_fs_hws_pr_pool_cleanup(pool);
+	kfree(pool);
+}
+
+static int
+mlx5_cmd_hws_packet_reformat_alloc(struct mlx5_flow_root_namespace *ns,
+				   struct mlx5_pkt_reformat_params *params,
+				   enum mlx5_flow_namespace_type namespace,
+				   struct mlx5_pkt_reformat *pkt_reformat)
+{
+	struct mlx5_fs_hws_context *fs_ctx = &ns->fs_hws_context;
+	struct mlx5_fs_hws_actions_pool *hws_pool;
+	struct mlx5hws_action *hws_action = NULL;
+	struct mlx5_fs_hws_pr *pr_data = NULL;
+	struct mlx5_fs_pool *pr_pool = NULL;
+	struct mlx5_core_dev *dev = ns->dev;
+	u8 hdr_idx = 0;
+	int err;
+
+	if (!params)
+		return -EINVAL;
+
+	hws_pool = &fs_ctx->hws_pool;
+
+	switch (params->type) {
+	case MLX5_REFORMAT_TYPE_L2_TO_VXLAN:
+	case MLX5_REFORMAT_TYPE_L2_TO_NVGRE:
+	case MLX5_REFORMAT_TYPE_L2_TO_L2_TUNNEL:
+		if (mlx5_fs_verify_encap_decap_params(dev, params))
+			return -EINVAL;
+		pr_pool = mlx5_fs_get_pr_encap_pool(dev, &hws_pool->el2tol2tnl_pools,
+						    MLX5HWS_ACTION_TYP_REFORMAT_L2_TO_TNL_L2,
+						    params->size);
+		if (IS_ERR(pr_pool))
+			return PTR_ERR(pr_pool);
+		break;
+	case MLX5_REFORMAT_TYPE_L2_TO_L3_TUNNEL:
+		if (mlx5_fs_verify_encap_decap_params(dev, params))
+			return -EINVAL;
+		pr_pool = mlx5_fs_get_pr_encap_pool(dev, &hws_pool->el2tol3tnl_pools,
+						    MLX5HWS_ACTION_TYP_REFORMAT_L2_TO_TNL_L3,
+						    params->size);
+		if (IS_ERR(pr_pool))
+			return PTR_ERR(pr_pool);
+		break;
+	case MLX5_REFORMAT_TYPE_L3_TUNNEL_TO_L2:
+		if (mlx5_fs_verify_encap_decap_params(dev, params))
+			return -EINVAL;
+		pr_pool = &hws_pool->dl3tnltol2_pool;
+		hdr_idx = params->size == ETH_HLEN ?
+			  MLX5_FS_DL3TNLTOL2_MAC_HDR_IDX :
+			  MLX5_FS_DL3TNLTOL2_MAC_VLAN_HDR_IDX;
+		break;
+	case MLX5_REFORMAT_TYPE_INSERT_HDR:
+		err = mlx5_fs_verify_insert_header_params(dev, params);
+		if (err)
+			return err;
+		pr_pool = &hws_pool->insert_hdr_pool;
+		break;
+	case MLX5_REFORMAT_TYPE_REMOVE_HDR:
+		hws_action = mlx5_fs_get_action_remove_header_vlan(fs_ctx, params);
+		if (!hws_action)
+			mlx5_core_err(dev, "Only vlan remove header supported\n");
+		break;
+	default:
+		mlx5_core_err(ns->dev, "Packet-reformat not supported(%d)\n",
+			      params->type);
+		return -EOPNOTSUPP;
+	}
+
+	if (pr_pool) {
+		pr_data = mlx5_fs_hws_pr_pool_acquire_pr(pr_pool);
+		if (IS_ERR_OR_NULL(pr_data))
+			return !pr_data ? -EINVAL : PTR_ERR(pr_data);
+		hws_action = pr_data->bulk->hws_action;
+		if (!hws_action) {
+			mlx5_core_err(dev,
+				      "Failed allocating packet-reformat action\n");
+			err = -EINVAL;
+			goto release_pr;
+		}
+		pr_data->data = kmemdup(params->data, params->size, GFP_KERNEL);
+		if (!pr_data->data) {
+			err = -ENOMEM;
+			goto release_pr;
+		}
+		pr_data->hdr_idx = hdr_idx;
+		pr_data->data_size = params->size;
+		pkt_reformat->fs_hws_action.pr_data = pr_data;
+	}
+
+	pkt_reformat->owner = MLX5_FLOW_RESOURCE_OWNER_SW;
+	pkt_reformat->fs_hws_action.hws_action = hws_action;
+	return 0;
+
+release_pr:
+	if (pr_pool && pr_data)
+		mlx5_fs_hws_pr_pool_release_pr(pr_pool, pr_data);
+	return err;
+}
+
+static void mlx5_cmd_hws_packet_reformat_dealloc(struct mlx5_flow_root_namespace *ns,
+						 struct mlx5_pkt_reformat *pkt_reformat)
+{
+	struct mlx5_fs_hws_actions_pool *hws_pool = &ns->fs_hws_context.hws_pool;
+	struct mlx5_core_dev *dev = ns->dev;
+	struct mlx5_fs_hws_pr *pr_data;
+	struct mlx5_fs_pool *pr_pool;
+
+	if (pkt_reformat->reformat_type == MLX5_REFORMAT_TYPE_REMOVE_HDR)
+		return;
+
+	if (!pkt_reformat->fs_hws_action.pr_data) {
+		mlx5_core_err(ns->dev, "Failed release packet-reformat\n");
+		return;
+	}
+	pr_data = pkt_reformat->fs_hws_action.pr_data;
+
+	switch (pkt_reformat->reformat_type) {
+	case MLX5_REFORMAT_TYPE_L2_TO_VXLAN:
+	case MLX5_REFORMAT_TYPE_L2_TO_NVGRE:
+	case MLX5_REFORMAT_TYPE_L2_TO_L2_TUNNEL:
+		pr_pool = mlx5_fs_get_pr_encap_pool(dev, &hws_pool->el2tol2tnl_pools,
+						    MLX5HWS_ACTION_TYP_REFORMAT_L2_TO_TNL_L2,
+						    pr_data->data_size);
+		break;
+	case MLX5_REFORMAT_TYPE_L2_TO_L3_TUNNEL:
+		pr_pool = mlx5_fs_get_pr_encap_pool(dev, &hws_pool->el2tol2tnl_pools,
+						    MLX5HWS_ACTION_TYP_REFORMAT_L2_TO_TNL_L2,
+						    pr_data->data_size);
+		break;
+	case MLX5_REFORMAT_TYPE_L3_TUNNEL_TO_L2:
+		pr_pool = &hws_pool->dl3tnltol2_pool;
+		break;
+	case MLX5_REFORMAT_TYPE_INSERT_HDR:
+		pr_pool = &hws_pool->insert_hdr_pool;
+		break;
+	default:
+		mlx5_core_err(ns->dev, "Unknown packet-reformat type\n");
+		return;
+	}
+	if (!pkt_reformat->fs_hws_action.pr_data || IS_ERR(pr_pool)) {
+		mlx5_core_err(ns->dev, "Failed release packet-reformat\n");
+		return;
+	}
+	kfree(pr_data->data);
+	mlx5_fs_hws_pr_pool_release_pr(pr_pool, pr_data);
+	pkt_reformat->fs_hws_action.pr_data = NULL;
+}
+
+static struct mlx5_fs_pool *
+mlx5_fs_create_mh_pool(struct mlx5_core_dev *dev,
+		       struct mlx5hws_action_mh_pattern *pattern,
+		       struct xarray *mh_pools, unsigned long index)
+{
+	struct mlx5_fs_pool *pool;
+	int err;
+
+	pool = kzalloc(sizeof(*pool), GFP_KERNEL);
+	if (!pool)
+		return ERR_PTR(-ENOMEM);
+	err = mlx5_fs_hws_mh_pool_init(pool, dev, pattern);
+	if (err)
+		goto free_pool;
+	err = xa_insert(mh_pools, index, pool, GFP_KERNEL);
+	if (err)
+		goto cleanup_pool;
+	return pool;
+
+cleanup_pool:
+	mlx5_fs_hws_mh_pool_cleanup(pool);
+free_pool:
+	kfree(pool);
+	return ERR_PTR(err);
+}
+
+static void
+mlx5_fs_destroy_mh_pool(struct mlx5_fs_pool *pool, struct xarray *mh_pools,
+			unsigned long index)
+{
+	xa_erase(mh_pools, index);
+	mlx5_fs_hws_mh_pool_cleanup(pool);
+	kfree(pool);
+}
+
+static int mlx5_cmd_hws_modify_header_alloc(struct mlx5_flow_root_namespace *ns,
+					    u8 namespace, u8 num_actions,
+					    void *modify_actions,
+					    struct mlx5_modify_hdr *modify_hdr)
+{
+	struct mlx5_fs_hws_actions_pool *hws_pool = &ns->fs_hws_context.hws_pool;
+	struct mlx5hws_action_mh_pattern pattern = {};
+	struct mlx5_fs_hws_mh *mh_data = NULL;
+	struct mlx5hws_action *hws_action;
+	struct mlx5_fs_pool *pool;
+	unsigned long i, cnt = 0;
+	bool known_pattern;
+	int err;
+
+	pattern.sz = MLX5_UN_SZ_BYTES(set_add_copy_action_in_auto) * num_actions;
+	pattern.data = modify_actions;
+
+	known_pattern = false;
+	xa_for_each(&hws_pool->mh_pools, i, pool) {
+		if (mlx5_fs_hws_mh_pool_match(pool, &pattern)) {
+			known_pattern = true;
+			break;
+		}
+		cnt++;
+	}
+
+	if (!known_pattern) {
+		pool = mlx5_fs_create_mh_pool(ns->dev, &pattern,
+					      &hws_pool->mh_pools, cnt);
+		if (IS_ERR(pool))
+			return PTR_ERR(pool);
+	}
+	mh_data = mlx5_fs_hws_mh_pool_acquire_mh(pool);
+	if (IS_ERR(mh_data)) {
+		err = PTR_ERR(mh_data);
+		goto destroy_pool;
+	}
+	hws_action = mh_data->bulk->hws_action;
+	mh_data->data = kmemdup(pattern.data, pattern.sz, GFP_KERNEL);
+	if (!mh_data->data) {
+		err = -ENOMEM;
+		goto release_mh;
+	}
+	modify_hdr->fs_hws_action.mh_data = mh_data;
+	modify_hdr->fs_hws_action.fs_pool = pool;
+	modify_hdr->owner = MLX5_FLOW_RESOURCE_OWNER_SW;
+	modify_hdr->fs_hws_action.hws_action = hws_action;
+
+	return 0;
+
+release_mh:
+	mlx5_fs_hws_mh_pool_release_mh(pool, mh_data);
+destroy_pool:
+	if (!known_pattern)
+		mlx5_fs_destroy_mh_pool(pool, &hws_pool->mh_pools, cnt);
+	return err;
+}
+
+static void mlx5_cmd_hws_modify_header_dealloc(struct mlx5_flow_root_namespace *ns,
+					       struct mlx5_modify_hdr *modify_hdr)
+{
+	struct mlx5_fs_hws_mh *mh_data;
+	struct mlx5_fs_pool *pool;
+
+	if (!modify_hdr->fs_hws_action.fs_pool || !modify_hdr->fs_hws_action.mh_data) {
+		mlx5_core_err(ns->dev, "Failed release modify-header\n");
+		return;
+	}
+
+	mh_data = modify_hdr->fs_hws_action.mh_data;
+	kfree(mh_data->data);
+	pool = modify_hdr->fs_hws_action.fs_pool;
+	mlx5_fs_hws_mh_pool_release_mh(pool, mh_data);
+	modify_hdr->fs_hws_action.mh_data = NULL;
+}
+
+static int mlx5_cmd_hws_create_match_definer(struct mlx5_flow_root_namespace *ns,
+					     u16 format_id, u32 *match_mask)
+{
+	return -EOPNOTSUPP;
+}
+
+static int mlx5_cmd_hws_destroy_match_definer(struct mlx5_flow_root_namespace *ns,
+					      int definer_id)
+{
+	return -EOPNOTSUPP;
+}
+
+static u32 mlx5_cmd_hws_get_capabilities(struct mlx5_flow_root_namespace *ns,
+					 enum fs_flow_table_type ft_type)
+{
+	if (ft_type != FS_FT_FDB)
+		return 0;
+
+	return MLX5_FLOW_STEERING_CAP_VLAN_PUSH_ON_RX |
+	       MLX5_FLOW_STEERING_CAP_VLAN_POP_ON_TX |
+	       MLX5_FLOW_STEERING_CAP_MATCH_RANGES;
+}
+
+bool mlx5_fs_hws_is_supported(struct mlx5_core_dev *dev)
+{
+	return mlx5hws_is_supported(dev);
+}
+
+static const struct mlx5_flow_cmds mlx5_flow_cmds_hws = {
+	.create_flow_table = mlx5_cmd_hws_create_flow_table,
+	.destroy_flow_table = mlx5_cmd_hws_destroy_flow_table,
+	.modify_flow_table = mlx5_cmd_hws_modify_flow_table,
+	.update_root_ft = mlx5_cmd_hws_update_root_ft,
+	.create_flow_group = mlx5_cmd_hws_create_flow_group,
+	.destroy_flow_group = mlx5_cmd_hws_destroy_flow_group,
+	.create_fte = mlx5_cmd_hws_create_fte,
+	.delete_fte = mlx5_cmd_hws_delete_fte,
+	.update_fte = mlx5_cmd_hws_update_fte,
+	.packet_reformat_alloc = mlx5_cmd_hws_packet_reformat_alloc,
+	.packet_reformat_dealloc = mlx5_cmd_hws_packet_reformat_dealloc,
+	.modify_header_alloc = mlx5_cmd_hws_modify_header_alloc,
+	.modify_header_dealloc = mlx5_cmd_hws_modify_header_dealloc,
+	.create_match_definer = mlx5_cmd_hws_create_match_definer,
+	.destroy_match_definer = mlx5_cmd_hws_destroy_match_definer,
+	.create_ns = mlx5_cmd_hws_create_ns,
+	.destroy_ns = mlx5_cmd_hws_destroy_ns,
+	.set_peer = mlx5_cmd_hws_set_peer,
+	.get_capabilities = mlx5_cmd_hws_get_capabilities,
+};
+
+const struct mlx5_flow_cmds *mlx5_fs_cmd_get_hws_cmds(void)
+{
+	return &mlx5_flow_cmds_hws;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws.h
new file mode 100644
index 000000000000..cbddb72d4362
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws.h
@@ -0,0 +1,80 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/* Copyright (c) 2025 NVIDIA Corporation & Affiliates */
+
+#ifndef _MLX5_FS_HWS_
+#define _MLX5_FS_HWS_
+
+#include "mlx5hws.h"
+#include "fs_hws_pools.h"
+
+struct mlx5_fs_hws_actions_pool {
+	struct mlx5hws_action *tag_action;
+	struct mlx5hws_action *pop_vlan_action;
+	struct mlx5hws_action *push_vlan_action;
+	struct mlx5hws_action *drop_action;
+	struct mlx5hws_action *decapl2_action;
+	struct mlx5hws_action *remove_hdr_vlan_action;
+	struct mlx5_fs_pool insert_hdr_pool;
+	struct mlx5_fs_pool dl3tnltol2_pool;
+	struct xarray el2tol3tnl_pools;
+	struct xarray el2tol2tnl_pools;
+	struct xarray mh_pools;
+	struct xarray table_dests;
+	struct xarray vport_vhca_dests;
+	struct xarray vport_dests;
+};
+
+struct mlx5_fs_hws_context {
+	struct mlx5hws_context	*hws_ctx;
+	struct mlx5_fs_hws_actions_pool hws_pool;
+};
+
+struct mlx5_fs_hws_table {
+	struct mlx5hws_table *hws_table;
+	bool miss_ft_set;
+};
+
+struct mlx5_fs_hws_action {
+	struct mlx5hws_action *hws_action;
+	struct mlx5_fs_pool *fs_pool;
+	struct mlx5_fs_hws_pr *pr_data;
+	struct mlx5_fs_hws_mh *mh_data;
+};
+
+struct mlx5_fs_hws_matcher {
+	struct mlx5hws_bwc_matcher *matcher;
+};
+
+struct mlx5_fs_hws_rule_action {
+	struct mlx5hws_action *action;
+	union {
+		struct mlx5_fc *counter;
+	};
+};
+
+struct mlx5_fs_hws_rule {
+	struct mlx5hws_bwc_rule *bwc_rule;
+	struct mlx5_fs_hws_rule_action *hws_fs_actions;
+	int num_fs_actions;
+};
+
+#ifdef CONFIG_MLX5_HW_STEERING
+
+bool mlx5_fs_hws_is_supported(struct mlx5_core_dev *dev);
+
+const struct mlx5_flow_cmds *mlx5_fs_cmd_get_hws_cmds(void);
+
+#else
+
+static inline bool mlx5_fs_hws_is_supported(struct mlx5_core_dev *dev)
+{
+	return false;
+}
+
+static inline const struct mlx5_flow_cmds *mlx5_fs_cmd_get_hws_cmds(void)
+{
+	return NULL;
+}
+
+#endif /* CONFIG_MLX5_HWS_STEERING */
+#endif
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws_pools.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws_pools.c
new file mode 100644
index 000000000000..2ae4ac62b0e2
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws_pools.c
@@ -0,0 +1,450 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/* Copyright (c) 2025 NVIDIA Corporation & Affiliates */
+
+#include <mlx5_core.h>
+#include "fs_hws_pools.h"
+
+#define MLX5_FS_HWS_DEFAULT_BULK_LEN 65536
+#define MLX5_FS_HWS_POOL_MAX_THRESHOLD BIT(18)
+#define MLX5_FS_HWS_POOL_USED_BUFF_RATIO 10
+
+static struct mlx5hws_action *
+mlx5_fs_dl3tnltol2_bulk_action_create(struct mlx5hws_context *ctx)
+{
+	struct mlx5hws_action_reformat_header reformat_hdr[2] = {};
+	u32 flags = MLX5HWS_ACTION_FLAG_HWS_FDB;
+	enum mlx5hws_action_type reformat_type;
+	u32 log_bulk_size;
+
+	reformat_type = MLX5HWS_ACTION_TYP_REFORMAT_TNL_L3_TO_L2;
+	reformat_hdr[MLX5_FS_DL3TNLTOL2_MAC_HDR_IDX].sz = ETH_HLEN;
+	reformat_hdr[MLX5_FS_DL3TNLTOL2_MAC_VLAN_HDR_IDX].sz = ETH_HLEN + VLAN_HLEN;
+
+	log_bulk_size = ilog2(MLX5_FS_HWS_DEFAULT_BULK_LEN);
+	return mlx5hws_action_create_reformat(ctx, reformat_type, 2,
+					      reformat_hdr, log_bulk_size, flags);
+}
+
+static struct mlx5hws_action *
+mlx5_fs_el2tol3tnl_bulk_action_create(struct mlx5hws_context *ctx, size_t data_size)
+{
+	struct mlx5hws_action_reformat_header reformat_hdr = {};
+	u32 flags = MLX5HWS_ACTION_FLAG_HWS_FDB;
+	enum mlx5hws_action_type reformat_type;
+	u32 log_bulk_size;
+
+	reformat_type = MLX5HWS_ACTION_TYP_REFORMAT_L2_TO_TNL_L3;
+	reformat_hdr.sz = data_size;
+
+	log_bulk_size = ilog2(MLX5_FS_HWS_DEFAULT_BULK_LEN);
+	return mlx5hws_action_create_reformat(ctx, reformat_type, 1,
+					      &reformat_hdr, log_bulk_size, flags);
+}
+
+static struct mlx5hws_action *
+mlx5_fs_el2tol2tnl_bulk_action_create(struct mlx5hws_context *ctx, size_t data_size)
+{
+	struct mlx5hws_action_reformat_header reformat_hdr = {};
+	u32 flags = MLX5HWS_ACTION_FLAG_HWS_FDB;
+	enum mlx5hws_action_type reformat_type;
+	u32 log_bulk_size;
+
+	reformat_type = MLX5HWS_ACTION_TYP_REFORMAT_L2_TO_TNL_L2;
+	reformat_hdr.sz = data_size;
+
+	log_bulk_size = ilog2(MLX5_FS_HWS_DEFAULT_BULK_LEN);
+	return mlx5hws_action_create_reformat(ctx, reformat_type, 1,
+					      &reformat_hdr, log_bulk_size, flags);
+}
+
+static struct mlx5hws_action *
+mlx5_fs_insert_hdr_bulk_action_create(struct mlx5hws_context *ctx)
+{
+	struct mlx5hws_action_insert_header insert_hdr = {};
+	u32 flags = MLX5HWS_ACTION_FLAG_HWS_FDB;
+	u32 log_bulk_size;
+
+	log_bulk_size = ilog2(MLX5_FS_HWS_DEFAULT_BULK_LEN);
+	insert_hdr.hdr.sz = MLX5_FS_INSERT_HDR_VLAN_SIZE;
+	insert_hdr.anchor = MLX5_FS_INSERT_HDR_VLAN_ANCHOR;
+	insert_hdr.offset = MLX5_FS_INSERT_HDR_VLAN_OFFSET;
+
+	return mlx5hws_action_create_insert_header(ctx, 1, &insert_hdr,
+						   log_bulk_size, flags);
+}
+
+static struct mlx5hws_action *
+mlx5_fs_pr_bulk_action_create(struct mlx5_core_dev *dev,
+			      struct mlx5_fs_hws_pr_pool_ctx *pr_pool_ctx)
+{
+	struct mlx5_flow_root_namespace *root_ns;
+	struct mlx5hws_context *ctx;
+	size_t encap_data_size;
+
+	root_ns = mlx5_get_root_namespace(dev, MLX5_FLOW_NAMESPACE_FDB);
+	if (!root_ns || root_ns->mode != MLX5_FLOW_STEERING_MODE_HMFS)
+		return NULL;
+
+	ctx = root_ns->fs_hws_context.hws_ctx;
+	if (!ctx)
+		return NULL;
+
+	encap_data_size = pr_pool_ctx->encap_data_size;
+	switch (pr_pool_ctx->reformat_type) {
+	case MLX5HWS_ACTION_TYP_REFORMAT_TNL_L3_TO_L2:
+		return mlx5_fs_dl3tnltol2_bulk_action_create(ctx);
+	case MLX5HWS_ACTION_TYP_REFORMAT_L2_TO_TNL_L3:
+		return mlx5_fs_el2tol3tnl_bulk_action_create(ctx, encap_data_size);
+	case MLX5HWS_ACTION_TYP_REFORMAT_L2_TO_TNL_L2:
+		return mlx5_fs_el2tol2tnl_bulk_action_create(ctx, encap_data_size);
+	case MLX5HWS_ACTION_TYP_INSERT_HEADER:
+		return mlx5_fs_insert_hdr_bulk_action_create(ctx);
+	default:
+		return NULL;
+	}
+	return NULL;
+}
+
+static struct mlx5_fs_bulk *
+mlx5_fs_hws_pr_bulk_create(struct mlx5_core_dev *dev, void *pool_ctx)
+{
+	struct mlx5_fs_hws_pr_pool_ctx *pr_pool_ctx;
+	struct mlx5_fs_hws_pr_bulk *pr_bulk;
+	int bulk_len;
+	int i;
+
+	if (!pool_ctx)
+		return NULL;
+	pr_pool_ctx = pool_ctx;
+	bulk_len = MLX5_FS_HWS_DEFAULT_BULK_LEN;
+	pr_bulk = kvzalloc(struct_size(pr_bulk, prs_data, bulk_len), GFP_KERNEL);
+	if (!pr_bulk)
+		return NULL;
+
+	if (mlx5_fs_bulk_init(dev, &pr_bulk->fs_bulk, bulk_len))
+		goto free_pr_bulk;
+
+	for (i = 0; i < bulk_len; i++) {
+		pr_bulk->prs_data[i].bulk = pr_bulk;
+		pr_bulk->prs_data[i].offset = i;
+	}
+
+	pr_bulk->hws_action = mlx5_fs_pr_bulk_action_create(dev, pr_pool_ctx);
+	if (!pr_bulk->hws_action)
+		goto cleanup_fs_bulk;
+
+	return &pr_bulk->fs_bulk;
+
+cleanup_fs_bulk:
+	mlx5_fs_bulk_cleanup(&pr_bulk->fs_bulk);
+free_pr_bulk:
+	kvfree(pr_bulk);
+	return NULL;
+}
+
+static int
+mlx5_fs_hws_pr_bulk_destroy(struct mlx5_core_dev *dev, struct mlx5_fs_bulk *fs_bulk)
+{
+	struct mlx5_fs_hws_pr_bulk *pr_bulk;
+
+	pr_bulk = container_of(fs_bulk, struct mlx5_fs_hws_pr_bulk, fs_bulk);
+	if (mlx5_fs_bulk_get_free_amount(fs_bulk) < fs_bulk->bulk_len) {
+		mlx5_core_err(dev, "Freeing bulk before all reformats were released\n");
+		return -EBUSY;
+	}
+
+	mlx5hws_action_destroy(pr_bulk->hws_action);
+	mlx5_fs_bulk_cleanup(fs_bulk);
+	kvfree(pr_bulk);
+
+	return 0;
+}
+
+static void mlx5_hws_pool_update_threshold(struct mlx5_fs_pool *hws_pool)
+{
+	hws_pool->threshold = min_t(int, MLX5_FS_HWS_POOL_MAX_THRESHOLD,
+				    hws_pool->used_units / MLX5_FS_HWS_POOL_USED_BUFF_RATIO);
+}
+
+static const struct mlx5_fs_pool_ops mlx5_fs_hws_pr_pool_ops = {
+	.bulk_create = mlx5_fs_hws_pr_bulk_create,
+	.bulk_destroy = mlx5_fs_hws_pr_bulk_destroy,
+	.update_threshold = mlx5_hws_pool_update_threshold,
+};
+
+int mlx5_fs_hws_pr_pool_init(struct mlx5_fs_pool *pr_pool,
+			     struct mlx5_core_dev *dev, size_t encap_data_size,
+			     enum mlx5hws_action_type reformat_type)
+{
+	struct mlx5_fs_hws_pr_pool_ctx *pr_pool_ctx;
+
+	if (reformat_type != MLX5HWS_ACTION_TYP_INSERT_HEADER &&
+	    reformat_type != MLX5HWS_ACTION_TYP_REFORMAT_TNL_L3_TO_L2 &&
+	    reformat_type != MLX5HWS_ACTION_TYP_REFORMAT_L2_TO_TNL_L3 &&
+	    reformat_type != MLX5HWS_ACTION_TYP_REFORMAT_L2_TO_TNL_L2)
+		return -EOPNOTSUPP;
+
+	pr_pool_ctx = kzalloc(sizeof(*pr_pool_ctx), GFP_KERNEL);
+	if (!pr_pool_ctx)
+		return -ENOMEM;
+	pr_pool_ctx->reformat_type = reformat_type;
+	pr_pool_ctx->encap_data_size = encap_data_size;
+	mlx5_fs_pool_init(pr_pool, dev, &mlx5_fs_hws_pr_pool_ops, pr_pool_ctx);
+	return 0;
+}
+
+void mlx5_fs_hws_pr_pool_cleanup(struct mlx5_fs_pool *pr_pool)
+{
+	struct mlx5_fs_hws_pr_pool_ctx *pr_pool_ctx;
+
+	mlx5_fs_pool_cleanup(pr_pool);
+	pr_pool_ctx = pr_pool->pool_ctx;
+	if (!pr_pool_ctx)
+		return;
+	kfree(pr_pool_ctx);
+}
+
+struct mlx5_fs_hws_pr *
+mlx5_fs_hws_pr_pool_acquire_pr(struct mlx5_fs_pool *pr_pool)
+{
+	struct mlx5_fs_pool_index pool_index = {};
+	struct mlx5_fs_hws_pr_bulk *pr_bulk;
+	int err;
+
+	err = mlx5_fs_pool_acquire_index(pr_pool, &pool_index);
+	if (err)
+		return ERR_PTR(err);
+	pr_bulk = container_of(pool_index.fs_bulk, struct mlx5_fs_hws_pr_bulk,
+			       fs_bulk);
+	return &pr_bulk->prs_data[pool_index.index];
+}
+
+void mlx5_fs_hws_pr_pool_release_pr(struct mlx5_fs_pool *pr_pool,
+				    struct mlx5_fs_hws_pr *pr_data)
+{
+	struct mlx5_fs_bulk *fs_bulk = &pr_data->bulk->fs_bulk;
+	struct mlx5_fs_pool_index pool_index = {};
+	struct mlx5_core_dev *dev = pr_pool->dev;
+
+	pool_index.fs_bulk = fs_bulk;
+	pool_index.index = pr_data->offset;
+	if (mlx5_fs_pool_release_index(pr_pool, &pool_index))
+		mlx5_core_warn(dev, "Attempted to release packet reformat which is not acquired\n");
+}
+
+struct mlx5hws_action *mlx5_fs_hws_pr_get_action(struct mlx5_fs_hws_pr *pr_data)
+{
+	return pr_data->bulk->hws_action;
+}
+
+static struct mlx5hws_action *
+mlx5_fs_mh_bulk_action_create(struct mlx5hws_context *ctx,
+			      struct mlx5hws_action_mh_pattern *pattern)
+{
+	u32 flags = MLX5HWS_ACTION_FLAG_HWS_FDB;
+	u32 log_bulk_size;
+
+	log_bulk_size = ilog2(MLX5_FS_HWS_DEFAULT_BULK_LEN);
+	return mlx5hws_action_create_modify_header(ctx, 1, pattern,
+						   log_bulk_size, flags);
+}
+
+static struct mlx5_fs_bulk *
+mlx5_fs_hws_mh_bulk_create(struct mlx5_core_dev *dev, void *pool_ctx)
+{
+	struct mlx5hws_action_mh_pattern *pattern;
+	struct mlx5_flow_root_namespace *root_ns;
+	struct mlx5_fs_hws_mh_bulk *mh_bulk;
+	struct mlx5hws_context *ctx;
+	int bulk_len;
+
+	if (!pool_ctx)
+		return NULL;
+
+	root_ns = mlx5_get_root_namespace(dev, MLX5_FLOW_NAMESPACE_FDB);
+	if (!root_ns || root_ns->mode != MLX5_FLOW_STEERING_MODE_HMFS)
+		return NULL;
+
+	ctx = root_ns->fs_hws_context.hws_ctx;
+	if (!ctx)
+		return NULL;
+
+	pattern = pool_ctx;
+	bulk_len = MLX5_FS_HWS_DEFAULT_BULK_LEN;
+	mh_bulk = kvzalloc(struct_size(mh_bulk, mhs_data, bulk_len), GFP_KERNEL);
+	if (!mh_bulk)
+		return NULL;
+
+	if (mlx5_fs_bulk_init(dev, &mh_bulk->fs_bulk, bulk_len))
+		goto free_mh_bulk;
+
+	for (int i = 0; i < bulk_len; i++) {
+		mh_bulk->mhs_data[i].bulk = mh_bulk;
+		mh_bulk->mhs_data[i].offset = i;
+	}
+
+	mh_bulk->hws_action = mlx5_fs_mh_bulk_action_create(ctx, pattern);
+	if (!mh_bulk->hws_action)
+		goto cleanup_fs_bulk;
+
+	return &mh_bulk->fs_bulk;
+
+cleanup_fs_bulk:
+	mlx5_fs_bulk_cleanup(&mh_bulk->fs_bulk);
+free_mh_bulk:
+	kvfree(mh_bulk);
+	return NULL;
+}
+
+static int
+mlx5_fs_hws_mh_bulk_destroy(struct mlx5_core_dev *dev,
+			    struct mlx5_fs_bulk *fs_bulk)
+{
+	struct mlx5_fs_hws_mh_bulk *mh_bulk;
+
+	mh_bulk = container_of(fs_bulk, struct mlx5_fs_hws_mh_bulk, fs_bulk);
+	if (mlx5_fs_bulk_get_free_amount(fs_bulk) < fs_bulk->bulk_len) {
+		mlx5_core_err(dev, "Freeing bulk before all modify header were released\n");
+		return -EBUSY;
+	}
+
+	mlx5hws_action_destroy(mh_bulk->hws_action);
+	mlx5_fs_bulk_cleanup(fs_bulk);
+	kvfree(mh_bulk);
+
+	return 0;
+}
+
+static const struct mlx5_fs_pool_ops mlx5_fs_hws_mh_pool_ops = {
+	.bulk_create = mlx5_fs_hws_mh_bulk_create,
+	.bulk_destroy = mlx5_fs_hws_mh_bulk_destroy,
+	.update_threshold = mlx5_hws_pool_update_threshold,
+};
+
+int mlx5_fs_hws_mh_pool_init(struct mlx5_fs_pool *fs_hws_mh_pool,
+			     struct mlx5_core_dev *dev,
+			     struct mlx5hws_action_mh_pattern *pattern)
+{
+	struct mlx5hws_action_mh_pattern *pool_pattern;
+
+	pool_pattern = kzalloc(sizeof(*pool_pattern), GFP_KERNEL);
+	if (!pool_pattern)
+		return -ENOMEM;
+	pool_pattern->data = kmemdup(pattern->data, pattern->sz, GFP_KERNEL);
+	if (!pool_pattern->data) {
+		kfree(pool_pattern);
+		return -ENOMEM;
+	}
+	pool_pattern->sz = pattern->sz;
+	mlx5_fs_pool_init(fs_hws_mh_pool, dev, &mlx5_fs_hws_mh_pool_ops,
+			  pool_pattern);
+	return 0;
+}
+
+void mlx5_fs_hws_mh_pool_cleanup(struct mlx5_fs_pool *fs_hws_mh_pool)
+{
+	struct mlx5hws_action_mh_pattern *pool_pattern;
+
+	mlx5_fs_pool_cleanup(fs_hws_mh_pool);
+	pool_pattern = fs_hws_mh_pool->pool_ctx;
+	if (!pool_pattern)
+		return;
+	kfree(pool_pattern->data);
+	kfree(pool_pattern);
+}
+
+struct mlx5_fs_hws_mh *
+mlx5_fs_hws_mh_pool_acquire_mh(struct mlx5_fs_pool *mh_pool)
+{
+	struct mlx5_fs_pool_index pool_index = {};
+	struct mlx5_fs_hws_mh_bulk *mh_bulk;
+	int err;
+
+	err = mlx5_fs_pool_acquire_index(mh_pool, &pool_index);
+	if (err)
+		return ERR_PTR(err);
+	mh_bulk = container_of(pool_index.fs_bulk, struct mlx5_fs_hws_mh_bulk,
+			       fs_bulk);
+	return &mh_bulk->mhs_data[pool_index.index];
+}
+
+void mlx5_fs_hws_mh_pool_release_mh(struct mlx5_fs_pool *mh_pool,
+				    struct mlx5_fs_hws_mh *mh_data)
+{
+	struct mlx5_fs_bulk *fs_bulk = &mh_data->bulk->fs_bulk;
+	struct mlx5_fs_pool_index pool_index = {};
+	struct mlx5_core_dev *dev = mh_pool->dev;
+
+	pool_index.fs_bulk = fs_bulk;
+	pool_index.index = mh_data->offset;
+	if (mlx5_fs_pool_release_index(mh_pool, &pool_index))
+		mlx5_core_warn(dev, "Attempted to release modify header which is not acquired\n");
+}
+
+bool mlx5_fs_hws_mh_pool_match(struct mlx5_fs_pool *mh_pool,
+			       struct mlx5hws_action_mh_pattern *pattern)
+{
+	struct mlx5hws_action_mh_pattern *pool_pattern;
+	int num_actions, i;
+
+	pool_pattern = mh_pool->pool_ctx;
+	if (WARN_ON_ONCE(!pool_pattern))
+		return false;
+
+	if (pattern->sz != pool_pattern->sz)
+		return false;
+	num_actions = pattern->sz / MLX5_UN_SZ_BYTES(set_add_copy_action_in_auto);
+	for (i = 0; i < num_actions; i++) {
+		if ((__force __be32)pattern->data[i] !=
+		    (__force __be32)pool_pattern->data[i])
+			return false;
+	}
+	return true;
+}
+
+struct mlx5hws_action *mlx5_fc_get_hws_action(struct mlx5hws_context *ctx,
+					      struct mlx5_fc *counter)
+{
+	u32 flags = MLX5HWS_ACTION_FLAG_HWS_FDB | MLX5HWS_ACTION_FLAG_SHARED;
+	struct mlx5_fc_bulk *fc_bulk = counter->bulk;
+	struct mlx5_fc_bulk_hws_data *fc_bulk_hws;
+
+	fc_bulk_hws = &fc_bulk->hws_data;
+	/* try avoid locking if not necessary */
+	if (refcount_inc_not_zero(&fc_bulk_hws->hws_action_refcount))
+		return fc_bulk_hws->hws_action;
+
+	mutex_lock(&fc_bulk_hws->lock);
+	if (refcount_inc_not_zero(&fc_bulk_hws->hws_action_refcount)) {
+		mutex_unlock(&fc_bulk_hws->lock);
+		return fc_bulk_hws->hws_action;
+	}
+	fc_bulk_hws->hws_action =
+		mlx5hws_action_create_counter(ctx, fc_bulk->base_id, flags);
+	if (!fc_bulk_hws->hws_action) {
+		mutex_unlock(&fc_bulk_hws->lock);
+		return NULL;
+	}
+	refcount_set(&fc_bulk_hws->hws_action_refcount, 1);
+	mutex_unlock(&fc_bulk_hws->lock);
+
+	return fc_bulk_hws->hws_action;
+}
+
+void mlx5_fc_put_hws_action(struct mlx5_fc *counter)
+{
+	struct mlx5_fc_bulk_hws_data *fc_bulk_hws = &counter->bulk->hws_data;
+
+	/* try avoid locking if not necessary */
+	if (refcount_dec_not_one(&fc_bulk_hws->hws_action_refcount))
+		return;
+
+	mutex_lock(&fc_bulk_hws->lock);
+	if (!refcount_dec_and_test(&fc_bulk_hws->hws_action_refcount)) {
+		mutex_unlock(&fc_bulk_hws->lock);
+		return;
+	}
+	mlx5hws_action_destroy(fc_bulk_hws->hws_action);
+	fc_bulk_hws->hws_action = NULL;
+	mutex_unlock(&fc_bulk_hws->lock);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws_pools.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws_pools.h
new file mode 100644
index 000000000000..34072551dd21
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws_pools.h
@@ -0,0 +1,73 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/* Copyright (c) 2025 NVIDIA Corporation & Affiliates */
+
+#ifndef __MLX5_FS_HWS_POOLS_H__
+#define __MLX5_FS_HWS_POOLS_H__
+
+#include <linux/if_vlan.h>
+#include "fs_pool.h"
+#include "fs_core.h"
+
+#define MLX5_FS_INSERT_HDR_VLAN_ANCHOR MLX5_REFORMAT_CONTEXT_ANCHOR_MAC_START
+#define MLX5_FS_INSERT_HDR_VLAN_OFFSET offsetof(struct vlan_ethhdr, h_vlan_proto)
+#define MLX5_FS_INSERT_HDR_VLAN_SIZE sizeof(struct vlan_hdr)
+
+enum {
+	MLX5_FS_DL3TNLTOL2_MAC_HDR_IDX = 0,
+	MLX5_FS_DL3TNLTOL2_MAC_VLAN_HDR_IDX,
+};
+
+struct mlx5_fs_hws_pr {
+	struct mlx5_fs_hws_pr_bulk *bulk;
+	u32 offset;
+	u8 hdr_idx;
+	u8 *data;
+	size_t data_size;
+};
+
+struct mlx5_fs_hws_pr_bulk {
+	struct mlx5_fs_bulk fs_bulk;
+	struct mlx5hws_action *hws_action;
+	struct mlx5_fs_hws_pr prs_data[];
+};
+
+struct mlx5_fs_hws_pr_pool_ctx {
+	enum mlx5hws_action_type reformat_type;
+	size_t encap_data_size;
+};
+
+struct mlx5_fs_hws_mh {
+	struct mlx5_fs_hws_mh_bulk *bulk;
+	u32 offset;
+	u8 *data;
+};
+
+struct mlx5_fs_hws_mh_bulk {
+	struct mlx5_fs_bulk fs_bulk;
+	struct mlx5_fs_pool *mh_pool;
+	struct mlx5hws_action *hws_action;
+	struct mlx5_fs_hws_mh mhs_data[];
+};
+
+int mlx5_fs_hws_pr_pool_init(struct mlx5_fs_pool *pr_pool,
+			     struct mlx5_core_dev *dev, size_t encap_data_size,
+			     enum mlx5hws_action_type reformat_type);
+void mlx5_fs_hws_pr_pool_cleanup(struct mlx5_fs_pool *pr_pool);
+
+struct mlx5_fs_hws_pr *mlx5_fs_hws_pr_pool_acquire_pr(struct mlx5_fs_pool *pr_pool);
+void mlx5_fs_hws_pr_pool_release_pr(struct mlx5_fs_pool *pr_pool,
+				    struct mlx5_fs_hws_pr *pr_data);
+struct mlx5hws_action *mlx5_fs_hws_pr_get_action(struct mlx5_fs_hws_pr *pr_data);
+int mlx5_fs_hws_mh_pool_init(struct mlx5_fs_pool *fs_hws_mh_pool,
+			     struct mlx5_core_dev *dev,
+			     struct mlx5hws_action_mh_pattern *pattern);
+void mlx5_fs_hws_mh_pool_cleanup(struct mlx5_fs_pool *fs_hws_mh_pool);
+struct mlx5_fs_hws_mh *mlx5_fs_hws_mh_pool_acquire_mh(struct mlx5_fs_pool *mh_pool);
+void mlx5_fs_hws_mh_pool_release_mh(struct mlx5_fs_pool *mh_pool,
+				    struct mlx5_fs_hws_mh *mh_data);
+bool mlx5_fs_hws_mh_pool_match(struct mlx5_fs_pool *mh_pool,
+			       struct mlx5hws_action_mh_pattern *pattern);
+struct mlx5hws_action *mlx5_fc_get_hws_action(struct mlx5hws_context *ctx,
+					      struct mlx5_fc *counter);
+void mlx5_fc_put_hws_action(struct mlx5_fc *counter);
+#endif /* __MLX5_FS_HWS_POOLS_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/internal.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/internal.h
index 3c8635f286ce..30ccd635b505 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/internal.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/internal.h
@@ -39,7 +39,6 @@
 #define mlx5hws_dbg(ctx, arg...) mlx5_core_dbg((ctx)->mdev, ##arg)
 
 #define MLX5HWS_TABLE_TYPE_BASE 2
-#define MLX5HWS_ACTION_STE_IDX_ANY 0
 
 static inline bool is_mem_zero(const u8 *mem, size_t size)
 {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c
index 1bb3a6f8c3cd..80157a29a076 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c
@@ -165,14 +165,14 @@ static int hws_matcher_disconnect(struct mlx5hws_matcher *matcher)
 						    next->match_ste.rtc_0_id,
 						    next->match_ste.rtc_1_id);
 		if (ret) {
-			mlx5hws_err(tbl->ctx, "Failed to disconnect matcher\n");
-			goto matcher_reconnect;
+			mlx5hws_err(tbl->ctx, "Fatal error, failed to disconnect matcher\n");
+			return ret;
 		}
 	} else {
 		ret = mlx5hws_table_connect_to_miss_table(tbl, tbl->default_miss.miss_tbl);
 		if (ret) {
-			mlx5hws_err(tbl->ctx, "Failed to disconnect last matcher\n");
-			goto matcher_reconnect;
+			mlx5hws_err(tbl->ctx, "Fatal error, failed to disconnect last matcher\n");
+			return ret;
 		}
 	}
 
@@ -180,27 +180,19 @@ static int hws_matcher_disconnect(struct mlx5hws_matcher *matcher)
 	if (prev_ft_id == tbl->ft_id) {
 		ret = mlx5hws_table_update_connected_miss_tables(tbl);
 		if (ret) {
-			mlx5hws_err(tbl->ctx, "Fatal error, failed to update connected miss table\n");
-			goto matcher_reconnect;
+			mlx5hws_err(tbl->ctx,
+				    "Fatal error, failed to update connected miss table\n");
+			return ret;
 		}
 	}
 
 	ret = mlx5hws_table_ft_set_default_next_ft(tbl, prev_ft_id);
 	if (ret) {
 		mlx5hws_err(tbl->ctx, "Fatal error, failed to restore matcher ft default miss\n");
-		goto matcher_reconnect;
+		return ret;
 	}
 
 	return 0;
-
-matcher_reconnect:
-	if (list_empty(&tbl->matchers_list) || !prev)
-		list_add(&matcher->list_node, &tbl->matchers_list);
-	else
-		/* insert after prev matcher */
-		list_add(&matcher->list_node, &prev->list_node);
-
-	return ret;
 }
 
 static void hws_matcher_set_rtc_attr_sz(struct mlx5hws_matcher *matcher,
@@ -208,7 +200,7 @@ static void hws_matcher_set_rtc_attr_sz(struct mlx5hws_matcher *matcher,
 					enum mlx5hws_matcher_rtc_type rtc_type,
 					bool is_mirror)
 {
-	struct mlx5hws_pool_chunk *ste = &matcher->action_ste[MLX5HWS_ACTION_STE_IDX_ANY].ste;
+	struct mlx5hws_pool_chunk *ste = &matcher->action_ste.ste;
 	enum mlx5hws_matcher_flow_src flow_src = matcher->attr.optimize_flow_src;
 	bool is_match_rtc = rtc_type == HWS_MATCHER_RTC_TYPE_MATCH;
 
@@ -225,8 +217,7 @@ static void hws_matcher_set_rtc_attr_sz(struct mlx5hws_matcher *matcher,
 }
 
 static int hws_matcher_create_rtc(struct mlx5hws_matcher *matcher,
-				  enum mlx5hws_matcher_rtc_type rtc_type,
-				  u8 action_ste_selector)
+				  enum mlx5hws_matcher_rtc_type rtc_type)
 {
 	struct mlx5hws_matcher_attr *attr = &matcher->attr;
 	struct mlx5hws_cmd_rtc_create_attr rtc_attr = {0};
@@ -286,14 +277,19 @@ static int hws_matcher_create_rtc(struct mlx5hws_matcher *matcher,
 		break;
 
 	case HWS_MATCHER_RTC_TYPE_STE_ARRAY:
-		action_ste = &matcher->action_ste[action_ste_selector];
+		action_ste = &matcher->action_ste;
 
 		rtc_0_id = &action_ste->rtc_0_id;
 		rtc_1_id = &action_ste->rtc_1_id;
 		ste_pool = action_ste->pool;
 		ste = &action_ste->ste;
+		/* Action RTC size calculation:
+		 * log((max number of rules in matcher) *
+		 *     (max number of action STEs per rule) *
+		 *     (2 to support writing new STEs for update rule))
+		 */
 		ste->order = ilog2(roundup_pow_of_two(action_ste->max_stes)) +
-			     attr->table.sz_row_log;
+			     attr->table.sz_row_log + 1;
 		rtc_attr.log_size = ste->order;
 		rtc_attr.log_depth = 0;
 		rtc_attr.update_index_mode = MLX5_IFC_RTC_STE_UPDATE_MODE_BY_OFFSET;
@@ -318,8 +314,8 @@ static int hws_matcher_create_rtc(struct mlx5hws_matcher *matcher,
 	hws_matcher_set_rtc_attr_sz(matcher, &rtc_attr, rtc_type, false);
 
 	/* STC is a single resource (obj_id), use any STC for the ID */
-	stc_pool = ctx->stc_pool[tbl->type];
-	default_stc = ctx->common_res[tbl->type].default_stc;
+	stc_pool = ctx->stc_pool;
+	default_stc = ctx->common_res.default_stc;
 	obj_id = mlx5hws_pool_chunk_get_base_id(stc_pool, &default_stc->default_hit);
 	rtc_attr.stc_base = obj_id;
 
@@ -358,8 +354,7 @@ free_ste:
 }
 
 static void hws_matcher_destroy_rtc(struct mlx5hws_matcher *matcher,
-				    enum mlx5hws_matcher_rtc_type rtc_type,
-				    u8 action_ste_selector)
+				    enum mlx5hws_matcher_rtc_type rtc_type)
 {
 	struct mlx5hws_matcher_action_ste *action_ste;
 	struct mlx5hws_table *tbl = matcher->tbl;
@@ -375,7 +370,7 @@ static void hws_matcher_destroy_rtc(struct mlx5hws_matcher *matcher,
 		ste = &matcher->match_ste.ste;
 		break;
 	case HWS_MATCHER_RTC_TYPE_STE_ARRAY:
-		action_ste = &matcher->action_ste[action_ste_selector];
+		action_ste = &matcher->action_ste;
 		rtc_0_id = action_ste->rtc_0_id;
 		rtc_1_id = action_ste->rtc_1_id;
 		ste_pool = action_ste->pool;
@@ -466,20 +461,13 @@ static int hws_matcher_resize_init(struct mlx5hws_matcher *src_matcher)
 	if (!resize_data)
 		return -ENOMEM;
 
-	resize_data->max_stes = src_matcher->action_ste[MLX5HWS_ACTION_STE_IDX_ANY].max_stes;
-
-	resize_data->action_ste[0].stc = src_matcher->action_ste[0].stc;
-	resize_data->action_ste[0].rtc_0_id = src_matcher->action_ste[0].rtc_0_id;
-	resize_data->action_ste[0].rtc_1_id = src_matcher->action_ste[0].rtc_1_id;
-	resize_data->action_ste[0].pool = src_matcher->action_ste[0].max_stes ?
-					  src_matcher->action_ste[0].pool :
-					  NULL;
-	resize_data->action_ste[1].stc = src_matcher->action_ste[1].stc;
-	resize_data->action_ste[1].rtc_0_id = src_matcher->action_ste[1].rtc_0_id;
-	resize_data->action_ste[1].rtc_1_id = src_matcher->action_ste[1].rtc_1_id;
-	resize_data->action_ste[1].pool = src_matcher->action_ste[1].max_stes ?
-					  src_matcher->action_ste[1].pool :
-					   NULL;
+	resize_data->max_stes = src_matcher->action_ste.max_stes;
+
+	resize_data->stc = src_matcher->action_ste.stc;
+	resize_data->rtc_0_id = src_matcher->action_ste.rtc_0_id;
+	resize_data->rtc_1_id = src_matcher->action_ste.rtc_1_id;
+	resize_data->pool = src_matcher->action_ste.max_stes ?
+			    src_matcher->action_ste.pool : NULL;
 
 	/* Place the new resized matcher on the dst matcher's list */
 	list_add(&resize_data->list_node, &src_matcher->resize_dst->resize_data);
@@ -512,49 +500,68 @@ static void hws_matcher_resize_uninit(struct mlx5hws_matcher *matcher)
 		if (resize_data->max_stes) {
 			mlx5hws_action_free_single_stc(matcher->tbl->ctx,
 						       matcher->tbl->type,
-						       &resize_data->action_ste[1].stc);
-			mlx5hws_action_free_single_stc(matcher->tbl->ctx,
-						       matcher->tbl->type,
-						       &resize_data->action_ste[0].stc);
+						       &resize_data->stc);
 
-			if (matcher->tbl->type == MLX5HWS_TABLE_TYPE_FDB) {
+			if (matcher->tbl->type == MLX5HWS_TABLE_TYPE_FDB)
 				mlx5hws_cmd_rtc_destroy(matcher->tbl->ctx->mdev,
-							resize_data->action_ste[1].rtc_1_id);
-				mlx5hws_cmd_rtc_destroy(matcher->tbl->ctx->mdev,
-							resize_data->action_ste[0].rtc_1_id);
-			}
-			mlx5hws_cmd_rtc_destroy(matcher->tbl->ctx->mdev,
-						resize_data->action_ste[1].rtc_0_id);
+							resize_data->rtc_1_id);
+
 			mlx5hws_cmd_rtc_destroy(matcher->tbl->ctx->mdev,
-						resize_data->action_ste[0].rtc_0_id);
-			if (resize_data->action_ste[MLX5HWS_ACTION_STE_IDX_ANY].pool) {
-				mlx5hws_pool_destroy(resize_data->action_ste[1].pool);
-				mlx5hws_pool_destroy(resize_data->action_ste[0].pool);
-			}
+						resize_data->rtc_0_id);
+
+			if (resize_data->pool)
+				mlx5hws_pool_destroy(resize_data->pool);
 		}
 
 		kfree(resize_data);
 	}
 }
 
-static int
-hws_matcher_bind_at_idx(struct mlx5hws_matcher *matcher, u8 action_ste_selector)
+static int hws_matcher_bind_at(struct mlx5hws_matcher *matcher)
 {
+	bool is_jumbo = mlx5hws_matcher_mt_is_jumbo(matcher->mt);
 	struct mlx5hws_cmd_stc_modify_attr stc_attr = {0};
 	struct mlx5hws_matcher_action_ste *action_ste;
 	struct mlx5hws_table *tbl = matcher->tbl;
 	struct mlx5hws_pool_attr pool_attr = {0};
 	struct mlx5hws_context *ctx = tbl->ctx;
-	int ret;
+	u32 required_stes;
+	u8 max_stes = 0;
+	int i, ret;
+
+	if (matcher->flags & MLX5HWS_MATCHER_FLAGS_COLLISION)
+		return 0;
+
+	for (i = 0; i < matcher->num_of_at; i++) {
+		struct mlx5hws_action_template *at = &matcher->at[i];
+
+		ret = hws_matcher_check_and_process_at(matcher, at);
+		if (ret) {
+			mlx5hws_err(ctx, "Invalid at %d", i);
+			return ret;
+		}
+
+		required_stes = at->num_of_action_stes - (!is_jumbo || at->only_term);
+		max_stes = max(max_stes, required_stes);
+
+		/* Future: Optimize reparse */
+	}
+
+	/* There are no additional STEs required for matcher */
+	if (!max_stes)
+		return 0;
+
+	matcher->action_ste.max_stes = max_stes;
 
-	action_ste = &matcher->action_ste[action_ste_selector];
+	action_ste = &matcher->action_ste;
 
 	/* Allocate action STE mempool */
 	pool_attr.table_type = tbl->type;
 	pool_attr.pool_type = MLX5HWS_POOL_TYPE_STE;
 	pool_attr.flags = MLX5HWS_POOL_FLAGS_FOR_STE_ACTION_POOL;
+	/* Pool size is similar to action RTC size */
 	pool_attr.alloc_log_sz = ilog2(roundup_pow_of_two(action_ste->max_stes)) +
-				 matcher->attr.table.sz_row_log;
+				 matcher->attr.table.sz_row_log + 1;
 	hws_matcher_set_pool_attr(&pool_attr, matcher);
 	action_ste->pool = mlx5hws_pool_create(ctx, &pool_attr);
 	if (!action_ste->pool) {
@@ -563,7 +570,7 @@ hws_matcher_bind_at_idx(struct mlx5hws_matcher *matcher, u8 action_ste_selector)
 	}
 
 	/* Allocate action RTC */
-	ret = hws_matcher_create_rtc(matcher, HWS_MATCHER_RTC_TYPE_STE_ARRAY, action_ste_selector);
+	ret = hws_matcher_create_rtc(matcher, HWS_MATCHER_RTC_TYPE_STE_ARRAY);
 	if (ret) {
 		mlx5hws_err(ctx, "Failed to create action RTC\n");
 		goto free_ste_pool;
@@ -587,18 +594,18 @@ hws_matcher_bind_at_idx(struct mlx5hws_matcher *matcher, u8 action_ste_selector)
 	return 0;
 
 free_rtc:
-	hws_matcher_destroy_rtc(matcher, HWS_MATCHER_RTC_TYPE_STE_ARRAY, action_ste_selector);
+	hws_matcher_destroy_rtc(matcher, HWS_MATCHER_RTC_TYPE_STE_ARRAY);
 free_ste_pool:
 	mlx5hws_pool_destroy(action_ste->pool);
 	return ret;
 }
 
-static void hws_matcher_unbind_at_idx(struct mlx5hws_matcher *matcher, u8 action_ste_selector)
+static void hws_matcher_unbind_at(struct mlx5hws_matcher *matcher)
 {
 	struct mlx5hws_matcher_action_ste *action_ste;
 	struct mlx5hws_table *tbl = matcher->tbl;
 
-	action_ste = &matcher->action_ste[action_ste_selector];
+	action_ste = &matcher->action_ste;
 
 	if (!action_ste->max_stes ||
 	    matcher->flags & MLX5HWS_MATCHER_FLAGS_COLLISION ||
@@ -606,65 +613,10 @@ static void hws_matcher_unbind_at_idx(struct mlx5hws_matcher *matcher, u8 action
 		return;
 
 	mlx5hws_action_free_single_stc(tbl->ctx, tbl->type, &action_ste->stc);
-	hws_matcher_destroy_rtc(matcher, HWS_MATCHER_RTC_TYPE_STE_ARRAY, action_ste_selector);
+	hws_matcher_destroy_rtc(matcher, HWS_MATCHER_RTC_TYPE_STE_ARRAY);
 	mlx5hws_pool_destroy(action_ste->pool);
 }
 
-static int hws_matcher_bind_at(struct mlx5hws_matcher *matcher)
-{
-	bool is_jumbo = mlx5hws_matcher_mt_is_jumbo(matcher->mt);
-	struct mlx5hws_table *tbl = matcher->tbl;
-	struct mlx5hws_context *ctx = tbl->ctx;
-	u32 required_stes;
-	u8 max_stes = 0;
-	int i, ret;
-
-	if (matcher->flags & MLX5HWS_MATCHER_FLAGS_COLLISION)
-		return 0;
-
-	for (i = 0; i < matcher->num_of_at; i++) {
-		struct mlx5hws_action_template *at = &matcher->at[i];
-
-		ret = hws_matcher_check_and_process_at(matcher, at);
-		if (ret) {
-			mlx5hws_err(ctx, "Invalid at %d", i);
-			return ret;
-		}
-
-		required_stes = at->num_of_action_stes - (!is_jumbo || at->only_term);
-		max_stes = max(max_stes, required_stes);
-
-		/* Future: Optimize reparse */
-	}
-
-	/* There are no additional STEs required for matcher */
-	if (!max_stes)
-		return 0;
-
-	matcher->action_ste[0].max_stes = max_stes;
-	matcher->action_ste[1].max_stes = max_stes;
-
-	ret = hws_matcher_bind_at_idx(matcher, 0);
-	if (ret)
-		return ret;
-
-	ret = hws_matcher_bind_at_idx(matcher, 1);
-	if (ret)
-		goto free_at_0;
-
-	return 0;
-
-free_at_0:
-	hws_matcher_unbind_at_idx(matcher, 0);
-	return ret;
-}
-
-static void hws_matcher_unbind_at(struct mlx5hws_matcher *matcher)
-{
-	hws_matcher_unbind_at_idx(matcher, 1);
-	hws_matcher_unbind_at_idx(matcher, 0);
-}
-
 static int hws_matcher_bind_mt(struct mlx5hws_matcher *matcher)
 {
 	struct mlx5hws_context *ctx = matcher->tbl->ctx;
@@ -810,7 +762,7 @@ static int hws_matcher_create_and_connect(struct mlx5hws_matcher *matcher)
 		goto unbind_at;
 
 	/* Allocate the RTC for the new matcher */
-	ret = hws_matcher_create_rtc(matcher, HWS_MATCHER_RTC_TYPE_MATCH, 0);
+	ret = hws_matcher_create_rtc(matcher, HWS_MATCHER_RTC_TYPE_MATCH);
 	if (ret)
 		goto destroy_end_ft;
 
@@ -822,7 +774,7 @@ static int hws_matcher_create_and_connect(struct mlx5hws_matcher *matcher)
 	return 0;
 
 destroy_rtc:
-	hws_matcher_destroy_rtc(matcher, HWS_MATCHER_RTC_TYPE_MATCH, 0);
+	hws_matcher_destroy_rtc(matcher, HWS_MATCHER_RTC_TYPE_MATCH);
 destroy_end_ft:
 	hws_matcher_destroy_end_ft(matcher);
 unbind_at:
@@ -836,7 +788,7 @@ static void hws_matcher_destroy_and_disconnect(struct mlx5hws_matcher *matcher)
 {
 	hws_matcher_resize_uninit(matcher);
 	hws_matcher_disconnect(matcher);
-	hws_matcher_destroy_rtc(matcher, HWS_MATCHER_RTC_TYPE_MATCH, 0);
+	hws_matcher_destroy_rtc(matcher, HWS_MATCHER_RTC_TYPE_MATCH);
 	hws_matcher_destroy_end_ft(matcher);
 	hws_matcher_unbind_at(matcher);
 	hws_matcher_unbind_mt(matcher);
@@ -970,10 +922,9 @@ int mlx5hws_matcher_attach_at(struct mlx5hws_matcher *matcher,
 		return ret;
 
 	required_stes = at->num_of_action_stes - (!is_jumbo || at->only_term);
-	if (matcher->action_ste[MLX5HWS_ACTION_STE_IDX_ANY].max_stes < required_stes) {
+	if (matcher->action_ste.max_stes < required_stes) {
 		mlx5hws_dbg(ctx, "Required STEs [%d] exceeds initial action template STE [%d]\n",
-			    required_stes,
-			    matcher->action_ste[MLX5HWS_ACTION_STE_IDX_ANY].max_stes);
+			    required_stes, matcher->action_ste.max_stes);
 		return -ENOMEM;
 	}
 
@@ -1007,9 +958,9 @@ hws_matcher_set_templates(struct mlx5hws_matcher *matcher,
 	if (!matcher->mt)
 		return -ENOMEM;
 
-	matcher->at = kcalloc(num_of_at + matcher->attr.max_num_of_at_attach,
-			      sizeof(*matcher->at),
-			      GFP_KERNEL);
+	matcher->at = kvcalloc(num_of_at + matcher->attr.max_num_of_at_attach,
+			       sizeof(*matcher->at),
+			       GFP_KERNEL);
 	if (!matcher->at) {
 		mlx5hws_err(ctx, "Failed to allocate action template array\n");
 		ret = -ENOMEM;
@@ -1035,7 +986,7 @@ free_mt:
 static void
 hws_matcher_unset_templates(struct mlx5hws_matcher *matcher)
 {
-	kfree(matcher->at);
+	kvfree(matcher->at);
 	kfree(matcher->mt);
 }
 
@@ -1157,8 +1108,7 @@ static int hws_matcher_resize_precheck(struct mlx5hws_matcher *src_matcher,
 		return -EINVAL;
 	}
 
-	if (src_matcher->action_ste[MLX5HWS_ACTION_STE_IDX_ANY].max_stes >
-	    dst_matcher->action_ste[0].max_stes) {
+	if (src_matcher->action_ste.max_stes > dst_matcher->action_ste.max_stes) {
 		mlx5hws_err(ctx, "Src/dst matcher max STEs mismatch\n");
 		return -EINVAL;
 	}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.h
index 81ff487f57be..cff4ae854a79 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.h
@@ -52,15 +52,11 @@ struct mlx5hws_matcher_action_ste {
 	u8 max_stes;
 };
 
-struct mlx5hws_matcher_resize_data_node {
+struct mlx5hws_matcher_resize_data {
 	struct mlx5hws_pool_chunk stc;
 	u32 rtc_0_id;
 	u32 rtc_1_id;
 	struct mlx5hws_pool *pool;
-};
-
-struct mlx5hws_matcher_resize_data {
-	struct mlx5hws_matcher_resize_data_node action_ste[2];
 	u8 max_stes;
 	struct list_head list_node;
 };
@@ -78,7 +74,7 @@ struct mlx5hws_matcher {
 	struct mlx5hws_matcher *col_matcher;
 	struct mlx5hws_matcher *resize_dst;
 	struct mlx5hws_matcher_match_ste match_ste;
-	struct mlx5hws_matcher_action_ste action_ste[2];
+	struct mlx5hws_matcher_action_ste action_ste;
 	struct list_head list_node;
 	struct list_head resize_data;
 };
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pat_arg.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pat_arg.c
index 06db5e4726ae..d9dc4f2d0dc6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pat_arg.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pat_arg.c
@@ -344,7 +344,7 @@ void mlx5hws_arg_write(struct mlx5hws_send_engine *queue,
 		mlx5hws_send_engine_post_req_wqe(&ctrl, (void *)&wqe_ctrl, &wqe_len);
 		memset(wqe_ctrl, 0, wqe_len);
 		mlx5hws_send_engine_post_req_wqe(&ctrl, (void *)&wqe_arg, &wqe_len);
-		memcpy(wqe_arg, arg_data, wqe_len);
+		memcpy(wqe_arg, arg_data, MLX5HWS_ARG_DATA_SIZE);
 		send_attr.id = arg_idx++;
 		mlx5hws_send_engine_post_end(&ctrl, &send_attr);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pat_arg.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pat_arg.h
index 27ca93385b08..8ddb51980044 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pat_arg.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pat_arg.h
@@ -31,7 +31,7 @@ struct mlx5hws_pattern_cache_item {
 		u8 *data;
 		u16 num_of_actions;
 	} mh_data;
-	u32 refcount;
+	u32 refcount; /* protected by pattern_cache lock */
 	struct list_head ptrn_list_node;
 };
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.c
index fed2d913f3b8..50a81d360bb2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.c
@@ -183,7 +183,7 @@ static int hws_pool_buddy_get_mem_chunk(struct mlx5hws_pool *pool,
 	*seg = -1;
 
 	/* Find the next free place from the buddy array */
-	while (*seg == -1) {
+	while (*seg < 0) {
 		for (i = 0; i < MLX5HWS_POOL_RESOURCE_ARR_SZ; i++) {
 			buddy = hws_pool_buddy_get_next_buddy(pool, i,
 							      order,
@@ -194,7 +194,7 @@ static int hws_pool_buddy_get_mem_chunk(struct mlx5hws_pool *pool,
 			}
 
 			*seg = mlx5hws_buddy_alloc_mem(buddy, order);
-			if (*seg != -1)
+			if (*seg >= 0)
 				goto found;
 
 			if (pool->flags & MLX5HWS_POOL_FLAGS_ONE_RESOURCE) {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/prm.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/prm.h
index de92cecbeb92..271490a51b96 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/prm.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/prm.h
@@ -390,11 +390,6 @@ struct mlx5_ifc_definer_bits {
 	u8 match_mask[0x160];
 };
 
-struct mlx5_ifc_arg_bits {
-	u8 rsvd0[0x88];
-	u8 access_pd[0x18];
-};
-
 struct mlx5_ifc_header_modify_pattern_in_bits {
 	u8 modify_field_select[0x40];
 
@@ -428,11 +423,6 @@ struct mlx5_ifc_create_definer_in_bits {
 	struct mlx5_ifc_definer_bits definer;
 };
 
-struct mlx5_ifc_create_arg_in_bits {
-	struct mlx5_ifc_general_obj_in_cmd_hdr_bits hdr;
-	struct mlx5_ifc_arg_bits arg;
-};
-
 struct mlx5_ifc_create_header_modify_pattern_in_bits {
 	struct mlx5_ifc_general_obj_in_cmd_hdr_bits hdr;
 	struct mlx5_ifc_header_modify_pattern_in_bits pattern;
@@ -479,36 +469,4 @@ enum {
 	MLX5_IFC_MODIFY_FLOW_TABLE_MISS_ACTION_GOTO_TBL = 1,
 };
 
-struct mlx5_ifc_alloc_packet_reformat_out_bits {
-	u8 status[0x8];
-	u8 reserved_at_8[0x18];
-
-	u8 syndrome[0x20];
-
-	u8 packet_reformat_id[0x20];
-
-	u8 reserved_at_60[0x20];
-};
-
-struct mlx5_ifc_dealloc_packet_reformat_in_bits {
-	u8 opcode[0x10];
-	u8 reserved_at_10[0x10];
-
-	u8 reserved_at_20[0x10];
-	u8 op_mod[0x10];
-
-	u8 packet_reformat_id[0x20];
-
-	u8 reserved_at_60[0x20];
-};
-
-struct mlx5_ifc_dealloc_packet_reformat_out_bits {
-	u8 status[0x8];
-	u8 reserved_at_8[0x18];
-
-	u8 syndrome[0x20];
-
-	u8 reserved_at_40[0x40];
-};
-
 #endif /* MLX5_PRM_H_ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/rule.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/rule.c
index e20c67a04203..a27a2d5ffc7b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/rule.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/rule.c
@@ -129,27 +129,18 @@ static void hws_rule_gen_comp(struct mlx5hws_send_engine *queue,
 
 static void
 hws_rule_save_resize_info(struct mlx5hws_rule *rule,
-			  struct mlx5hws_send_ste_attr *ste_attr,
-			  bool is_update)
+			  struct mlx5hws_send_ste_attr *ste_attr)
 {
 	if (!mlx5hws_matcher_is_resizable(rule->matcher))
 		return;
 
-	if (likely(!is_update)) {
+	/* resize_info might already exist (if we're in update flow) */
+	if (likely(!rule->resize_info)) {
 		rule->resize_info = kzalloc(sizeof(*rule->resize_info), GFP_KERNEL);
 		if (unlikely(!rule->resize_info)) {
 			pr_warn("HWS: resize info isn't allocated for rule\n");
 			return;
 		}
-
-		rule->resize_info->max_stes =
-			rule->matcher->action_ste[MLX5HWS_ACTION_STE_IDX_ANY].max_stes;
-		rule->resize_info->action_ste_pool[0] = rule->matcher->action_ste[0].max_stes ?
-							rule->matcher->action_ste[0].pool :
-							NULL;
-		rule->resize_info->action_ste_pool[1] = rule->matcher->action_ste[1].max_stes ?
-							rule->matcher->action_ste[1].pool :
-							NULL;
 	}
 
 	memcpy(rule->resize_info->ctrl_seg, ste_attr->wqe_ctrl,
@@ -204,15 +195,14 @@ hws_rule_load_delete_info(struct mlx5hws_rule *rule,
 	}
 }
 
-static int hws_rule_alloc_action_ste_idx(struct mlx5hws_rule *rule,
-					 u8 action_ste_selector)
+static int hws_rule_alloc_action_ste(struct mlx5hws_rule *rule)
 {
 	struct mlx5hws_matcher *matcher = rule->matcher;
 	struct mlx5hws_matcher_action_ste *action_ste;
 	struct mlx5hws_pool_chunk ste = {0};
 	int ret;
 
-	action_ste = &matcher->action_ste[action_ste_selector];
+	action_ste = &matcher->action_ste;
 	ste.order = ilog2(roundup_pow_of_two(action_ste->max_stes));
 	ret = mlx5hws_pool_chunk_alloc(action_ste->pool, &ste);
 	if (unlikely(ret)) {
@@ -220,68 +210,29 @@ static int hws_rule_alloc_action_ste_idx(struct mlx5hws_rule *rule,
 			    "Failed to allocate STE for rule actions");
 		return ret;
 	}
-	rule->action_ste_idx = ste.offset;
+
+	rule->action_ste.pool = matcher->action_ste.pool;
+	rule->action_ste.num_stes = matcher->action_ste.max_stes;
+	rule->action_ste.index = ste.offset;
 
 	return 0;
 }
 
-static void hws_rule_free_action_ste_idx(struct mlx5hws_rule *rule,
-					 u8 action_ste_selector)
+void mlx5hws_rule_free_action_ste(struct mlx5hws_rule_action_ste_info *action_ste)
 {
-	struct mlx5hws_matcher *matcher = rule->matcher;
 	struct mlx5hws_pool_chunk ste = {0};
-	struct mlx5hws_pool *pool;
-	u8 max_stes;
-
-	if (mlx5hws_matcher_is_resizable(matcher)) {
-		/* Free the original action pool if rule was resized */
-		max_stes = rule->resize_info->max_stes;
-		pool = rule->resize_info->action_ste_pool[action_ste_selector];
-	} else {
-		max_stes = matcher->action_ste[action_ste_selector].max_stes;
-		pool = matcher->action_ste[action_ste_selector].pool;
-	}
-
-	/* This release is safe only when the rule match part was deleted */
-	ste.order = ilog2(roundup_pow_of_two(max_stes));
-	ste.offset = rule->action_ste_idx;
-
-	mlx5hws_pool_chunk_free(pool, &ste);
-}
 
-static int hws_rule_alloc_action_ste(struct mlx5hws_rule *rule,
-				     struct mlx5hws_rule_attr *attr)
-{
-	int action_ste_idx;
-	int ret;
-
-	ret = hws_rule_alloc_action_ste_idx(rule, 0);
-	if (unlikely(ret))
-		return ret;
-
-	action_ste_idx = rule->action_ste_idx;
-
-	ret = hws_rule_alloc_action_ste_idx(rule, 1);
-	if (unlikely(ret)) {
-		hws_rule_free_action_ste_idx(rule, 0);
-		return ret;
-	}
-
-	/* Both pools have to return the same index */
-	if (unlikely(rule->action_ste_idx != action_ste_idx)) {
-		pr_warn("HWS: allocation of action STE failed - pool indexes mismatch\n");
-		return -EINVAL;
-	}
+	if (!action_ste->num_stes)
+		return;
 
-	return 0;
-}
+	ste.order = ilog2(roundup_pow_of_two(action_ste->num_stes));
+	ste.offset = action_ste->index;
 
-void mlx5hws_rule_free_action_ste(struct mlx5hws_rule *rule)
-{
-	if (rule->action_ste_idx > -1) {
-		hws_rule_free_action_ste_idx(rule, 1);
-		hws_rule_free_action_ste_idx(rule, 0);
-	}
+	/* This release is safe only when the rule match STE was deleted
+	 * (when the rule is being deleted) or replaced with the new STE that
+	 * isn't pointing to old action STEs (when the rule is being updated).
+	 */
+	mlx5hws_pool_chunk_free(action_ste->pool, &ste);
 }
 
 static void hws_rule_create_init(struct mlx5hws_rule *rule,
@@ -298,14 +249,24 @@ static void hws_rule_create_init(struct mlx5hws_rule *rule,
 		/* In update we use these rtc's */
 		rule->rtc_0 = 0;
 		rule->rtc_1 = 0;
-		rule->action_ste_selector = 0;
+
+		rule->action_ste.pool = NULL;
+		rule->action_ste.num_stes = 0;
+		rule->action_ste.index = -1;
+
+		rule->status = MLX5HWS_RULE_STATUS_CREATING;
 	} else {
-		rule->action_ste_selector = !rule->action_ste_selector;
+		rule->status = MLX5HWS_RULE_STATUS_UPDATING;
 	}
 
+	/* Initialize the old action STE info - shallow-copy action_ste.
+	 * In create flow this will set old_action_ste fields to initial values.
+	 * In update flow this will save the existing action STE info,
+	 * so that we will later use it to free old STEs.
+	 */
+	rule->old_action_ste = rule->action_ste;
+
 	rule->pending_wqes = 0;
-	rule->action_ste_idx = -1;
-	rule->status = MLX5HWS_RULE_STATUS_CREATING;
 
 	/* Init default send STE attributes */
 	ste_attr->gta_opcode = MLX5HWS_WQE_GTA_OP_ACTIVATE;
@@ -315,8 +276,8 @@ static void hws_rule_create_init(struct mlx5hws_rule *rule,
 
 	/* Init default action apply */
 	apply->tbl_type = tbl->type;
-	apply->common_res = &ctx->common_res[tbl->type];
-	apply->jump_to_action_stc = matcher->action_ste[0].stc.offset;
+	apply->common_res = &ctx->common_res;
+	apply->jump_to_action_stc = matcher->action_ste.stc.offset;
 	apply->require_dep = 0;
 }
 
@@ -332,8 +293,6 @@ static void hws_rule_move_init(struct mlx5hws_rule *rule,
 	rule->rtc_1 = 0;
 
 	rule->pending_wqes = 0;
-	rule->action_ste_idx = -1;
-	rule->action_ste_selector = 0;
 	rule->status = MLX5HWS_RULE_STATUS_CREATING;
 	rule->resize_info->state = MLX5HWS_RULE_RESIZE_STATE_WRITING;
 }
@@ -394,21 +353,17 @@ static int hws_rule_create_hws(struct mlx5hws_rule *rule,
 
 	if (action_stes) {
 		/* Allocate action STEs for rules that need more than match STE */
-		if (!is_update) {
-			ret = hws_rule_alloc_action_ste(rule, attr);
-			if (ret) {
-				mlx5hws_err(ctx, "Failed to allocate action memory %d", ret);
-				mlx5hws_send_abort_new_dep_wqe(queue);
-				return ret;
-			}
+		ret = hws_rule_alloc_action_ste(rule);
+		if (ret) {
+			mlx5hws_err(ctx, "Failed to allocate action memory %d", ret);
+			mlx5hws_send_abort_new_dep_wqe(queue);
+			return ret;
 		}
 		/* Skip RX/TX based on the dep_wqe init */
-		ste_attr.rtc_0 = dep_wqe->rtc_0 ?
-				 matcher->action_ste[rule->action_ste_selector].rtc_0_id : 0;
-		ste_attr.rtc_1 = dep_wqe->rtc_1 ?
-				 matcher->action_ste[rule->action_ste_selector].rtc_1_id : 0;
+		ste_attr.rtc_0 = dep_wqe->rtc_0 ? matcher->action_ste.rtc_0_id : 0;
+		ste_attr.rtc_1 = dep_wqe->rtc_1 ? matcher->action_ste.rtc_1_id : 0;
 		/* Action STEs are written to a specific index last to first */
-		ste_attr.direct_index = rule->action_ste_idx + action_stes;
+		ste_attr.direct_index = rule->action_ste.index + action_stes;
 		apply.next_direct_idx = ste_attr.direct_index;
 	} else {
 		apply.next_direct_idx = 0;
@@ -459,7 +414,7 @@ static int hws_rule_create_hws(struct mlx5hws_rule *rule,
 	if (!is_update)
 		hws_rule_save_delete_info(rule, &ste_attr);
 
-	hws_rule_save_resize_info(rule, &ste_attr, is_update);
+	hws_rule_save_resize_info(rule, &ste_attr);
 	mlx5hws_send_engine_inc_rule(queue);
 
 	if (!attr->burst)
@@ -480,7 +435,10 @@ static void hws_rule_destroy_failed_hws(struct mlx5hws_rule *rule,
 			  attr->user_data, MLX5HWS_RULE_STATUS_DELETED);
 
 	/* Rule failed now we can safely release action STEs */
-	mlx5hws_rule_free_action_ste(rule);
+	mlx5hws_rule_free_action_ste(&rule->action_ste);
+
+	/* Perhaps the rule failed updating - release old action STEs as well */
+	mlx5hws_rule_free_action_ste(&rule->old_action_ste);
 
 	/* Clear complex tag */
 	hws_rule_clear_delete_info(rule);
@@ -517,7 +475,8 @@ static int hws_rule_destroy_hws(struct mlx5hws_rule *rule,
 	}
 
 	/* Rule is not completed yet */
-	if (rule->status == MLX5HWS_RULE_STATUS_CREATING)
+	if (rule->status == MLX5HWS_RULE_STATUS_CREATING ||
+	    rule->status == MLX5HWS_RULE_STATUS_UPDATING)
 		return -EBUSY;
 
 	/* Rule failed and doesn't require cleanup */
@@ -534,7 +493,7 @@ static int hws_rule_destroy_hws(struct mlx5hws_rule *rule,
 		hws_rule_gen_comp(queue, rule, false,
 				  attr->user_data, MLX5HWS_RULE_STATUS_DELETED);
 
-		mlx5hws_rule_free_action_ste(rule);
+		mlx5hws_rule_free_action_ste(&rule->action_ste);
 		mlx5hws_rule_clear_resize_info(rule);
 		return 0;
 	}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/rule.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/rule.h
index 495cdd17e9f3..b5ee94ac449b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/rule.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/rule.h
@@ -15,6 +15,8 @@ enum mlx5hws_rule_status {
 	MLX5HWS_RULE_STATUS_UNKNOWN,
 	MLX5HWS_RULE_STATUS_CREATING,
 	MLX5HWS_RULE_STATUS_CREATED,
+	MLX5HWS_RULE_STATUS_UPDATING,
+	MLX5HWS_RULE_STATUS_UPDATED,
 	MLX5HWS_RULE_STATUS_DELETING,
 	MLX5HWS_RULE_STATUS_DELETED,
 	MLX5HWS_RULE_STATUS_FAILING,
@@ -41,13 +43,17 @@ struct mlx5hws_rule_match_tag {
 	};
 };
 
+struct mlx5hws_rule_action_ste_info {
+	struct mlx5hws_pool *pool;
+	int index; /* STE array index */
+	u8 num_stes;
+};
+
 struct mlx5hws_rule_resize_info {
-	struct mlx5hws_pool *action_ste_pool[2];
 	u32 rtc_0;
 	u32 rtc_1;
 	u32 rule_idx;
 	u8 state;
-	u8 max_stes;
 	u8 ctrl_seg[MLX5HWS_WQE_SZ_GTA_CTRL]; /* Ctrl segment of STE: 48 bytes */
 	u8 data_seg[MLX5HWS_WQE_SZ_GTA_DATA]; /* Data segment of STE: 64 bytes */
 };
@@ -58,18 +64,18 @@ struct mlx5hws_rule {
 		struct mlx5hws_rule_match_tag tag;
 		struct mlx5hws_rule_resize_info *resize_info;
 	};
+	struct mlx5hws_rule_action_ste_info action_ste;
+	struct mlx5hws_rule_action_ste_info old_action_ste;
 	u32 rtc_0; /* The RTC into which the STE was inserted */
 	u32 rtc_1; /* The RTC into which the STE was inserted */
-	int action_ste_idx; /* STE array index */
 	u8 status; /* enum mlx5hws_rule_status */
-	u8 action_ste_selector; /* For rule update - which action STE is in use */
 	u8 pending_wqes;
 	bool skip_delete; /* For complex rules - another rule with same tag
 			   * still exists, so don't actually delete this rule.
 			   */
 };
 
-void mlx5hws_rule_free_action_ste(struct mlx5hws_rule *rule);
+void mlx5hws_rule_free_action_ste(struct mlx5hws_rule_action_ste_info *action_ste);
 
 int mlx5hws_rule_move_hws_remove(struct mlx5hws_rule *rule,
 				 void *queue, void *user_data);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c
index 20fe126ffd22..cb6abc4ab7df 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c
@@ -377,17 +377,25 @@ static void hws_send_engine_update_rule(struct mlx5hws_send_engine *queue,
 
 			*status = MLX5HWS_FLOW_OP_ERROR;
 		} else {
-			/* Increase the status, this only works on good flow as the enum
-			 * is arrange it away creating -> created -> deleting -> deleted
+			/* Increase the status, this only works on good flow as
+			 * the enum is arranged this way:
+			 *  - creating -> created
+			 *  - updating -> updated
+			 *  - deleting -> deleted
 			 */
 			priv->rule->status++;
 			*status = MLX5HWS_FLOW_OP_SUCCESS;
-			/* Rule was deleted now we can safely release action STEs
-			 * and clear resize info
-			 */
 			if (priv->rule->status == MLX5HWS_RULE_STATUS_DELETED) {
-				mlx5hws_rule_free_action_ste(priv->rule);
+				/* Rule was deleted, now we can safely release
+				 * action STEs and clear resize info
+				 */
+				mlx5hws_rule_free_action_ste(&priv->rule->action_ste);
 				mlx5hws_rule_clear_resize_info(priv->rule);
+			} else if (priv->rule->status == MLX5HWS_RULE_STATUS_UPDATED) {
+				/* Rule was updated, free the old action STEs */
+				mlx5hws_rule_free_action_ste(&priv->rule->old_action_ste);
+				/* Update completed - move the rule back to "created" */
+				priv->rule->status = MLX5HWS_RULE_STATUS_CREATED;
 			}
 		}
 	}
@@ -633,6 +641,7 @@ static int hws_send_ring_create_sq(struct mlx5_core_dev *mdev, u32 pdn,
 
 	MLX5_SET(sqc, sqc, state, MLX5_SQC_STATE_RST);
 	MLX5_SET(sqc, sqc, flush_in_error_en, 1);
+	MLX5_SET(sqc, sqc, non_wire, 1);
 
 	ts_format = mlx5_is_real_time_sq(mdev) ? MLX5_TIMESTAMP_FORMAT_REAL_TIME :
 						 MLX5_TIMESTAMP_FORMAT_FREE_RUNNING;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/table.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/table.c
index 9576e02d00c3..ab1297531232 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/table.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/table.c
@@ -37,6 +37,7 @@ static void hws_table_set_cap_attr(struct mlx5hws_table *tbl,
 }
 
 static int hws_table_up_default_fdb_miss_tbl(struct mlx5hws_table *tbl)
+__must_hold(&tbl->ctx->ctrl_lock)
 {
 	struct mlx5hws_cmd_ft_create_attr ft_attr = {0};
 	struct mlx5hws_cmd_set_fte_attr fte_attr = {0};
@@ -48,8 +49,8 @@ static int hws_table_up_default_fdb_miss_tbl(struct mlx5hws_table *tbl)
 	if (tbl->type != MLX5HWS_TABLE_TYPE_FDB)
 		return 0;
 
-	if (ctx->common_res[tbl_type].default_miss) {
-		ctx->common_res[tbl_type].default_miss->refcount++;
+	if (ctx->common_res.default_miss) {
+		ctx->common_res.default_miss->refcount++;
 		return 0;
 	}
 
@@ -70,29 +71,28 @@ static int hws_table_up_default_fdb_miss_tbl(struct mlx5hws_table *tbl)
 		return -EINVAL;
 	}
 
-	/* ctx->ctrl_lock must be held here */
-	ctx->common_res[tbl_type].default_miss = default_miss;
-	ctx->common_res[tbl_type].default_miss->refcount++;
+	ctx->common_res.default_miss = default_miss;
+	ctx->common_res.default_miss->refcount++;
 
 	return 0;
 }
 
 /* Called under ctx->ctrl_lock */
 static void hws_table_down_default_fdb_miss_tbl(struct mlx5hws_table *tbl)
+__must_hold(&tbl->ctx->ctrl_lock)
 {
 	struct mlx5hws_cmd_forward_tbl *default_miss;
 	struct mlx5hws_context *ctx = tbl->ctx;
-	u8 tbl_type = tbl->type;
 
 	if (tbl->type != MLX5HWS_TABLE_TYPE_FDB)
 		return;
 
-	default_miss = ctx->common_res[tbl_type].default_miss;
+	default_miss = ctx->common_res.default_miss;
 	if (--default_miss->refcount)
 		return;
 
 	mlx5hws_cmd_forward_tbl_destroy(ctx->mdev, default_miss);
-	ctx->common_res[tbl_type].default_miss = NULL;
+	ctx->common_res.default_miss = NULL;
 }
 
 static int hws_table_connect_to_default_miss_tbl(struct mlx5hws_table *tbl, u32 ft_id)
@@ -478,15 +478,9 @@ int mlx5hws_table_set_default_miss(struct mlx5hws_table *tbl,
 	if (old_miss_tbl)
 		list_del_init(&tbl->default_miss.next);
 
-	old_miss_tbl = tbl->default_miss.miss_tbl;
-	if (old_miss_tbl)
-		list_del_init(&old_miss_tbl->default_miss.head);
-
 	if (miss_tbl)
 		list_add(&tbl->default_miss.next, &miss_tbl->default_miss.head);
 
-	mutex_unlock(&ctx->ctrl_lock);
-	return 0;
 out:
 	mutex_unlock(&ctx->ctrl_lock);
 	return ret;
diff --git a/drivers/net/ethernet/meta/fbnic/Makefile b/drivers/net/ethernet/meta/fbnic/Makefile
index 239b2258ec65..ea6214ca48e7 100644
--- a/drivers/net/ethernet/meta/fbnic/Makefile
+++ b/drivers/net/ethernet/meta/fbnic/Makefile
@@ -13,7 +13,6 @@ fbnic-y := fbnic_csr.o \
 	   fbnic_ethtool.o \
 	   fbnic_fw.o \
 	   fbnic_hw_stats.o \
-	   fbnic_hwmon.o \
 	   fbnic_irq.o \
 	   fbnic_mac.o \
 	   fbnic_netdev.o \
diff --git a/drivers/net/ethernet/meta/fbnic/fbnic.h b/drivers/net/ethernet/meta/fbnic/fbnic.h
index 14751f16e125..50f97f5399ff 100644
--- a/drivers/net/ethernet/meta/fbnic/fbnic.h
+++ b/drivers/net/ethernet/meta/fbnic/fbnic.h
@@ -24,7 +24,6 @@ struct fbnic_dev {
 	struct device *dev;
 	struct net_device *netdev;
 	struct dentry *dbg_fbd;
-	struct device *hwmon;
 
 	u32 __iomem *uc_addr0;
 	u32 __iomem *uc_addr4;
@@ -42,7 +41,6 @@ struct fbnic_dev {
 
 	struct fbnic_fw_mbx mbx[FBNIC_IPC_MBX_INDICES];
 	struct fbnic_fw_cap fw_cap;
-	struct fbnic_fw_completion *cmpl_data;
 	/* Lock protecting Tx Mailbox queue to prevent possible races */
 	spinlock_t fw_tx_lock;
 
@@ -151,9 +149,6 @@ void fbnic_devlink_unregister(struct fbnic_dev *fbd);
 int fbnic_fw_enable_mbx(struct fbnic_dev *fbd);
 void fbnic_fw_disable_mbx(struct fbnic_dev *fbd);
 
-void fbnic_hwmon_register(struct fbnic_dev *fbd);
-void fbnic_hwmon_unregister(struct fbnic_dev *fbd);
-
 int fbnic_pcs_irq_enable(struct fbnic_dev *fbd);
 void fbnic_pcs_irq_disable(struct fbnic_dev *fbd);
 
diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_fw.h b/drivers/net/ethernet/meta/fbnic/fbnic_fw.h
index 7cd8841920e4..221faf8c6756 100644
--- a/drivers/net/ethernet/meta/fbnic/fbnic_fw.h
+++ b/drivers/net/ethernet/meta/fbnic/fbnic_fw.h
@@ -44,13 +44,6 @@ struct fbnic_fw_cap {
 	u8	link_fec;
 };
 
-struct fbnic_fw_completion {
-	struct {
-		s32 millivolts;
-		s32 millidegrees;
-	} tsene;
-};
-
 void fbnic_mbx_init(struct fbnic_dev *fbd);
 void fbnic_mbx_clean(struct fbnic_dev *fbd);
 void fbnic_mbx_poll(struct fbnic_dev *fbd);
diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_hwmon.c b/drivers/net/ethernet/meta/fbnic/fbnic_hwmon.c
deleted file mode 100644
index bcd1086e3768..000000000000
--- a/drivers/net/ethernet/meta/fbnic/fbnic_hwmon.c
+++ /dev/null
@@ -1,81 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/* Copyright (c) Meta Platforms, Inc. and affiliates. */
-
-#include <linux/hwmon.h>
-
-#include "fbnic.h"
-#include "fbnic_mac.h"
-
-static int fbnic_hwmon_sensor_id(enum hwmon_sensor_types type)
-{
-	if (type == hwmon_temp)
-		return FBNIC_SENSOR_TEMP;
-	if (type == hwmon_in)
-		return FBNIC_SENSOR_VOLTAGE;
-
-	return -EOPNOTSUPP;
-}
-
-static umode_t fbnic_hwmon_is_visible(const void *drvdata,
-				      enum hwmon_sensor_types type,
-				      u32 attr, int channel)
-{
-	if (type == hwmon_temp && attr == hwmon_temp_input)
-		return 0444;
-	if (type == hwmon_in && attr == hwmon_in_input)
-		return 0444;
-
-	return 0;
-}
-
-static int fbnic_hwmon_read(struct device *dev, enum hwmon_sensor_types type,
-			    u32 attr, int channel, long *val)
-{
-	struct fbnic_dev *fbd = dev_get_drvdata(dev);
-	const struct fbnic_mac *mac = fbd->mac;
-	int id;
-
-	id = fbnic_hwmon_sensor_id(type);
-	return id < 0 ? id : mac->get_sensor(fbd, id, val);
-}
-
-static const struct hwmon_ops fbnic_hwmon_ops = {
-	.is_visible = fbnic_hwmon_is_visible,
-	.read = fbnic_hwmon_read,
-};
-
-static const struct hwmon_channel_info *fbnic_hwmon_info[] = {
-	HWMON_CHANNEL_INFO(temp, HWMON_T_INPUT),
-	HWMON_CHANNEL_INFO(in, HWMON_I_INPUT),
-	NULL
-};
-
-static const struct hwmon_chip_info fbnic_chip_info = {
-	.ops = &fbnic_hwmon_ops,
-	.info = fbnic_hwmon_info,
-};
-
-void fbnic_hwmon_register(struct fbnic_dev *fbd)
-{
-	if (!IS_REACHABLE(CONFIG_HWMON))
-		return;
-
-	fbd->hwmon = hwmon_device_register_with_info(fbd->dev, "fbnic",
-						     fbd, &fbnic_chip_info,
-						     NULL);
-	if (IS_ERR(fbd->hwmon)) {
-		dev_notice(fbd->dev,
-			   "Failed to register hwmon device %pe\n",
-			fbd->hwmon);
-		fbd->hwmon = NULL;
-	}
-}
-
-void fbnic_hwmon_unregister(struct fbnic_dev *fbd)
-{
-	if (!IS_REACHABLE(CONFIG_HWMON) || !fbd->hwmon)
-		return;
-
-	hwmon_device_unregister(fbd->hwmon);
-	fbd->hwmon = NULL;
-}
diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_mac.c b/drivers/net/ethernet/meta/fbnic/fbnic_mac.c
index 80b82ff12c4d..7b654d0a6dac 100644
--- a/drivers/net/ethernet/meta/fbnic/fbnic_mac.c
+++ b/drivers/net/ethernet/meta/fbnic/fbnic_mac.c
@@ -686,27 +686,6 @@ fbnic_mac_get_eth_mac_stats(struct fbnic_dev *fbd, bool reset,
 			    MAC_STAT_TX_BROADCAST);
 }
 
-static int fbnic_mac_get_sensor_asic(struct fbnic_dev *fbd, int id, long *val)
-{
-	struct fbnic_fw_completion fw_cmpl;
-	s32 *sensor;
-
-	switch (id) {
-	case FBNIC_SENSOR_TEMP:
-		sensor = &fw_cmpl.tsene.millidegrees;
-		break;
-	case FBNIC_SENSOR_VOLTAGE:
-		sensor = &fw_cmpl.tsene.millivolts;
-		break;
-	default:
-		return -EINVAL;
-	}
-
-	*val = *sensor;
-
-	return 0;
-}
-
 static const struct fbnic_mac fbnic_mac_asic = {
 	.init_regs = fbnic_mac_init_regs,
 	.pcs_enable = fbnic_pcs_enable_asic,
@@ -716,7 +695,6 @@ static const struct fbnic_mac fbnic_mac_asic = {
 	.get_eth_mac_stats = fbnic_mac_get_eth_mac_stats,
 	.link_down = fbnic_mac_link_down_asic,
 	.link_up = fbnic_mac_link_up_asic,
-	.get_sensor = fbnic_mac_get_sensor_asic,
 };
 
 /**
diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_mac.h b/drivers/net/ethernet/meta/fbnic/fbnic_mac.h
index 05a591653e09..476239a9d381 100644
--- a/drivers/net/ethernet/meta/fbnic/fbnic_mac.h
+++ b/drivers/net/ethernet/meta/fbnic/fbnic_mac.h
@@ -47,11 +47,6 @@ enum {
 #define FBNIC_LINK_MODE_PAM4	(FBNIC_LINK_50R1)
 #define FBNIC_LINK_MODE_MASK	(FBNIC_LINK_AUTO - 1)
 
-enum fbnic_sensor_id {
-	FBNIC_SENSOR_TEMP,		/* Temp in millidegrees Centigrade */
-	FBNIC_SENSOR_VOLTAGE,		/* Voltage in millivolts */
-};
-
 /* This structure defines the interface hooks for the MAC. The MAC hooks
  * will be configured as a const struct provided with a set of function
  * pointers.
@@ -88,8 +83,6 @@ struct fbnic_mac {
 
 	void (*link_down)(struct fbnic_dev *fbd);
 	void (*link_up)(struct fbnic_dev *fbd, bool tx_pause, bool rx_pause);
-
-	int (*get_sensor)(struct fbnic_dev *fbd, int id, long *val);
 };
 
 int fbnic_mac_init(struct fbnic_dev *fbd);
diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_pci.c b/drivers/net/ethernet/meta/fbnic/fbnic_pci.c
index 6cbbc2ee3e1f..2c96980d150d 100644
--- a/drivers/net/ethernet/meta/fbnic/fbnic_pci.c
+++ b/drivers/net/ethernet/meta/fbnic/fbnic_pci.c
@@ -296,8 +296,6 @@ static int fbnic_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	/* Capture snapshot of hardware stats so netdev can calculate delta */
 	fbnic_reset_hw_stats(fbd);
 
-	fbnic_hwmon_register(fbd);
-
 	if (!fbd->dsn) {
 		dev_warn(&pdev->dev, "Reading serial number failed\n");
 		goto init_failure_mode;
@@ -360,7 +358,6 @@ static void fbnic_remove(struct pci_dev *pdev)
 		fbnic_netdev_free(fbd);
 	}
 
-	fbnic_hwmon_unregister(fbd);
 	fbnic_dbg_fbd_exit(fbd);
 	fbnic_devlink_unregister(fbd);
 	fbnic_fw_disable_mbx(fbd);
diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_txrx.c b/drivers/net/ethernet/meta/fbnic/fbnic_txrx.c
index bb54ce5f5787..d4d7027df9a0 100644
--- a/drivers/net/ethernet/meta/fbnic/fbnic_txrx.c
+++ b/drivers/net/ethernet/meta/fbnic/fbnic_txrx.c
@@ -1033,7 +1033,7 @@ static int fbnic_poll(struct napi_struct *napi, int budget)
 	if (likely(napi_complete_done(napi, work_done)))
 		fbnic_nv_irq_rearm(nv);
 
-	return 0;
+	return work_done;
 }
 
 irqreturn_t fbnic_msix_clean_rings(int __always_unused irq, void *data)
diff --git a/drivers/net/ethernet/realtek/r8169.h b/drivers/net/ethernet/realtek/r8169.h
index e0817f2a311a..7a194a8ab989 100644
--- a/drivers/net/ethernet/realtek/r8169.h
+++ b/drivers/net/ethernet/realtek/r8169.h
@@ -70,6 +70,7 @@ enum mac_version {
 	RTL_GIGA_MAC_VER_63,
 	RTL_GIGA_MAC_VER_64,
 	RTL_GIGA_MAC_VER_65,
+	RTL_GIGA_MAC_VER_66,
 	RTL_GIGA_MAC_VER_70,
 	RTL_GIGA_MAC_VER_71,
 	RTL_GIGA_MAC_NONE
diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c
index 5724f650f9c6..4b77f2151204 100644
--- a/drivers/net/ethernet/realtek/r8169_main.c
+++ b/drivers/net/ethernet/realtek/r8169_main.c
@@ -58,6 +58,7 @@
 #define FIRMWARE_8125B_2	"rtl_nic/rtl8125b-2.fw"
 #define FIRMWARE_8125D_1	"rtl_nic/rtl8125d-1.fw"
 #define FIRMWARE_8125D_2	"rtl_nic/rtl8125d-2.fw"
+#define FIRMWARE_8125BP_2	"rtl_nic/rtl8125bp-2.fw"
 #define FIRMWARE_8126A_2	"rtl_nic/rtl8126a-2.fw"
 #define FIRMWARE_8126A_3	"rtl_nic/rtl8126a-3.fw"
 
@@ -142,6 +143,7 @@ static const struct {
 	[RTL_GIGA_MAC_VER_63] = {"RTL8125B",		FIRMWARE_8125B_2},
 	[RTL_GIGA_MAC_VER_64] = {"RTL8125D",		FIRMWARE_8125D_1},
 	[RTL_GIGA_MAC_VER_65] = {"RTL8125D",		FIRMWARE_8125D_2},
+	[RTL_GIGA_MAC_VER_66] = {"RTL8125BP",		FIRMWARE_8125BP_2},
 	[RTL_GIGA_MAC_VER_70] = {"RTL8126A",		FIRMWARE_8126A_2},
 	[RTL_GIGA_MAC_VER_71] = {"RTL8126A",		FIRMWARE_8126A_3},
 };
@@ -632,6 +634,7 @@ enum rtl_dash_type {
 	RTL_DASH_NONE,
 	RTL_DASH_DP,
 	RTL_DASH_EP,
+	RTL_DASH_25_BP,
 };
 
 struct rtl8169_private {
@@ -709,6 +712,7 @@ MODULE_FIRMWARE(FIRMWARE_8125A_3);
 MODULE_FIRMWARE(FIRMWARE_8125B_2);
 MODULE_FIRMWARE(FIRMWARE_8125D_1);
 MODULE_FIRMWARE(FIRMWARE_8125D_2);
+MODULE_FIRMWARE(FIRMWARE_8125BP_2);
 MODULE_FIRMWARE(FIRMWARE_8126A_2);
 MODULE_FIRMWARE(FIRMWARE_8126A_3);
 
@@ -1361,10 +1365,19 @@ static void rtl8168ep_driver_start(struct rtl8169_private *tp)
 		rtl_loop_wait_high(tp, &rtl_ep_ocp_read_cond, 10000, 30);
 }
 
+static void rtl8125bp_driver_start(struct rtl8169_private *tp)
+{
+	r8168ep_ocp_write(tp, 0x01, 0x14, OOB_CMD_DRIVER_START);
+	r8168ep_ocp_write(tp, 0x01, 0x18, 0x00);
+	r8168ep_ocp_write(tp, 0x01, 0x10, 0x01);
+}
+
 static void rtl8168_driver_start(struct rtl8169_private *tp)
 {
 	if (tp->dash_type == RTL_DASH_DP)
 		rtl8168dp_driver_start(tp);
+	else if (tp->dash_type == RTL_DASH_25_BP)
+		rtl8125bp_driver_start(tp);
 	else
 		rtl8168ep_driver_start(tp);
 }
@@ -1385,10 +1398,19 @@ static void rtl8168ep_driver_stop(struct rtl8169_private *tp)
 		rtl_loop_wait_low(tp, &rtl_ep_ocp_read_cond, 10000, 10);
 }
 
+static void rtl8125bp_driver_stop(struct rtl8169_private *tp)
+{
+	r8168ep_ocp_write(tp, 0x01, 0x14, OOB_CMD_DRIVER_STOP);
+	r8168ep_ocp_write(tp, 0x01, 0x18, 0x00);
+	r8168ep_ocp_write(tp, 0x01, 0x10, 0x01);
+}
+
 static void rtl8168_driver_stop(struct rtl8169_private *tp)
 {
 	if (tp->dash_type == RTL_DASH_DP)
 		rtl8168dp_driver_stop(tp);
+	else if (tp->dash_type == RTL_DASH_25_BP)
+		rtl8125bp_driver_stop(tp);
 	else
 		rtl8168ep_driver_stop(tp);
 }
@@ -1411,6 +1433,7 @@ static bool rtl_dash_is_enabled(struct rtl8169_private *tp)
 	case RTL_DASH_DP:
 		return r8168dp_check_dash(tp);
 	case RTL_DASH_EP:
+	case RTL_DASH_25_BP:
 		return r8168ep_check_dash(tp);
 	default:
 		return false;
@@ -1425,6 +1448,8 @@ static enum rtl_dash_type rtl_get_dash_type(struct rtl8169_private *tp)
 		return RTL_DASH_DP;
 	case RTL_GIGA_MAC_VER_51 ... RTL_GIGA_MAC_VER_53:
 		return RTL_DASH_EP;
+	case RTL_GIGA_MAC_VER_66:
+		return RTL_DASH_25_BP;
 	default:
 		return RTL_DASH_NONE;
 	}
@@ -2261,6 +2286,9 @@ static enum mac_version rtl8169_get_mac_version(u16 xid, bool gmii)
 		{ 0x7cf, 0x64a,	RTL_GIGA_MAC_VER_71 },
 		{ 0x7cf, 0x649,	RTL_GIGA_MAC_VER_70 },
 
+		/* 8125BP family. */
+		{ 0x7cf, 0x681,	RTL_GIGA_MAC_VER_66 },
+
 		/* 8125D family. */
 		{ 0x7cf, 0x689,	RTL_GIGA_MAC_VER_65 },
 		{ 0x7cf, 0x688,	RTL_GIGA_MAC_VER_64 },
@@ -3842,6 +3870,7 @@ static void rtl_hw_config(struct rtl8169_private *tp)
 		[RTL_GIGA_MAC_VER_63] = rtl_hw_start_8125b,
 		[RTL_GIGA_MAC_VER_64] = rtl_hw_start_8125d,
 		[RTL_GIGA_MAC_VER_65] = rtl_hw_start_8125d,
+		[RTL_GIGA_MAC_VER_66] = rtl_hw_start_8125d,
 		[RTL_GIGA_MAC_VER_70] = rtl_hw_start_8126a,
 		[RTL_GIGA_MAC_VER_71] = rtl_hw_start_8126a,
 	};
@@ -3861,6 +3890,7 @@ static void rtl_hw_start_8125(struct rtl8169_private *tp)
 	case RTL_GIGA_MAC_VER_61:
 	case RTL_GIGA_MAC_VER_64:
 	case RTL_GIGA_MAC_VER_65:
+	case RTL_GIGA_MAC_VER_66:
 		for (i = 0xa00; i < 0xb00; i += 4)
 			RTL_W32(tp, i, 0);
 		break;
diff --git a/drivers/net/ethernet/realtek/r8169_phy_config.c b/drivers/net/ethernet/realtek/r8169_phy_config.c
index 968c8a2185a4..cf95e579c65d 100644
--- a/drivers/net/ethernet/realtek/r8169_phy_config.c
+++ b/drivers/net/ethernet/realtek/r8169_phy_config.c
@@ -1102,6 +1102,28 @@ static void rtl8125d_hw_phy_config(struct rtl8169_private *tp,
 	rtl8125_config_eee_phy(phydev);
 }
 
+static void rtl8125bp_hw_phy_config(struct rtl8169_private *tp,
+				    struct phy_device *phydev)
+{
+	r8169_apply_firmware(tp);
+	rtl8168g_enable_gphy_10m(phydev);
+
+	r8168g_phy_param(phydev, 0x8010, 0x0800, 0x0000);
+
+	phy_write(phydev, 0x1f, 0x0b87);
+	phy_write(phydev, 0x16, 0x8088);
+	phy_modify(phydev, 0x17, 0xff00, 0x9000);
+	phy_write(phydev, 0x16, 0x808f);
+	phy_modify(phydev, 0x17, 0xff00, 0x9000);
+	phy_write(phydev, 0x1f, 0x0000);
+
+	r8168g_phy_param(phydev, 0x8174, 0x2000, 0x1800);
+
+	rtl8125_legacy_force_mode(phydev);
+	rtl8168g_disable_aldps(phydev);
+	rtl8125_config_eee_phy(phydev);
+}
+
 static void rtl8126a_hw_phy_config(struct rtl8169_private *tp,
 				   struct phy_device *phydev)
 {
@@ -1163,6 +1185,7 @@ void r8169_hw_phy_config(struct rtl8169_private *tp, struct phy_device *phydev,
 		[RTL_GIGA_MAC_VER_63] = rtl8125b_hw_phy_config,
 		[RTL_GIGA_MAC_VER_64] = rtl8125d_hw_phy_config,
 		[RTL_GIGA_MAC_VER_65] = rtl8125d_hw_phy_config,
+		[RTL_GIGA_MAC_VER_66] = rtl8125bp_hw_phy_config,
 		[RTL_GIGA_MAC_VER_70] = rtl8126a_hw_phy_config,
 		[RTL_GIGA_MAC_VER_71] = rtl8126a_hw_phy_config,
 	};
diff --git a/drivers/net/ethernet/realtek/rtase/rtase_main.c b/drivers/net/ethernet/realtek/rtase/rtase_main.c
index 585d0b21c9e0..3bd11cb56294 100644
--- a/drivers/net/ethernet/realtek/rtase/rtase_main.c
+++ b/drivers/net/ethernet/realtek/rtase/rtase_main.c
@@ -1828,7 +1828,7 @@ static int rtase_alloc_msix(struct pci_dev *pdev, struct rtase_private *tp)
 
 	for (i = 0; i < tp->int_nums; i++) {
 		irq = pci_irq_vector(pdev, i);
-		if (!irq) {
+		if (irq < 0) {
 			pci_disable_msix(pdev);
 			return irq;
 		}
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-sti.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-sti.c
index eabc4da9e1a9..de9b6dfef15b 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-sti.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-sti.c
@@ -313,7 +313,6 @@ static void sti_dwmac_remove(struct platform_device *pdev)
 	clk_disable_unprepare(dwmac->clk);
 }
 
-#ifdef CONFIG_PM_SLEEP
 static int sti_dwmac_suspend(struct device *dev)
 {
 	struct sti_dwmac *dwmac = get_stmmac_bsp_priv(dev);
@@ -333,10 +332,9 @@ static int sti_dwmac_resume(struct device *dev)
 
 	return stmmac_resume(dev);
 }
-#endif /* CONFIG_PM_SLEEP */
 
-static SIMPLE_DEV_PM_OPS(sti_dwmac_pm_ops, sti_dwmac_suspend,
-					   sti_dwmac_resume);
+static DEFINE_SIMPLE_DEV_PM_OPS(sti_dwmac_pm_ops, sti_dwmac_suspend,
+						  sti_dwmac_resume);
 
 static const struct sti_dwmac_of_data stih4xx_dwmac_data = {
 	.fix_retime_src = stih4xx_fix_retime_src,
@@ -353,7 +351,7 @@ static struct platform_driver sti_dwmac_driver = {
 	.remove = sti_dwmac_remove,
 	.driver = {
 		.name           = "sti-dwmac",
-		.pm		= &sti_dwmac_pm_ops,
+		.pm		= pm_sleep_ptr(&sti_dwmac_pm_ops),
 		.of_match_table = sti_dwmac_match,
 	},
 };
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-tegra.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-tegra.c
index 3827997d2132..dc903b846b1b 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-tegra.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-tegra.c
@@ -1,4 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-only
+#include <linux/iommu.h>
 #include <linux/platform_device.h>
 #include <linux/of.h>
 #include <linux/module.h>
@@ -19,6 +20,8 @@ struct tegra_mgbe {
 	struct reset_control *rst_mac;
 	struct reset_control *rst_pcs;
 
+	u32 iommu_sid;
+
 	void __iomem *hv;
 	void __iomem *regs;
 	void __iomem *xpcs;
@@ -50,7 +53,6 @@ struct tegra_mgbe {
 #define MGBE_WRAP_COMMON_INTR_ENABLE	0x8704
 #define MAC_SBD_INTR			BIT(2)
 #define MGBE_WRAP_AXI_ASID0_CTRL	0x8400
-#define MGBE_SID			0x6
 
 static int __maybe_unused tegra_mgbe_suspend(struct device *dev)
 {
@@ -84,7 +86,7 @@ static int __maybe_unused tegra_mgbe_resume(struct device *dev)
 	writel(MAC_SBD_INTR, mgbe->regs + MGBE_WRAP_COMMON_INTR_ENABLE);
 
 	/* Program SID */
-	writel(MGBE_SID, mgbe->hv + MGBE_WRAP_AXI_ASID0_CTRL);
+	writel(mgbe->iommu_sid, mgbe->hv + MGBE_WRAP_AXI_ASID0_CTRL);
 
 	value = readl(mgbe->xpcs + XPCS_WRAP_UPHY_STATUS);
 	if ((value & XPCS_WRAP_UPHY_STATUS_TX_P_UP) == 0) {
@@ -241,6 +243,12 @@ static int tegra_mgbe_probe(struct platform_device *pdev)
 	if (IS_ERR(mgbe->xpcs))
 		return PTR_ERR(mgbe->xpcs);
 
+	/* get controller's stream id from iommu property in device tree */
+	if (!tegra_dev_iommu_get_stream_id(mgbe->dev, &mgbe->iommu_sid)) {
+		dev_err(mgbe->dev, "failed to get iommu stream id\n");
+		return -EINVAL;
+	}
+
 	res.addr = mgbe->regs;
 	res.irq = irq;
 
@@ -346,7 +354,7 @@ static int tegra_mgbe_probe(struct platform_device *pdev)
 	writel(MAC_SBD_INTR, mgbe->regs + MGBE_WRAP_COMMON_INTR_ENABLE);
 
 	/* Program SID */
-	writel(MGBE_SID, mgbe->hv + MGBE_WRAP_AXI_ASID0_CTRL);
+	writel(mgbe->iommu_sid, mgbe->hv + MGBE_WRAP_AXI_ASID0_CTRL);
 
 	plat->flags |= STMMAC_FLAG_SERDES_UP_AFTER_PHY_LINKUP;
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
index c36f90a782c5..9ed8620580a8 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
@@ -420,10 +420,10 @@ static void dwmac4_set_eee_pls(struct mac_device_info *hw, int link)
 	writel(value, ioaddr + GMAC4_LPI_CTRL_STATUS);
 }
 
-static void dwmac4_set_eee_lpi_entry_timer(struct mac_device_info *hw, int et)
+static void dwmac4_set_eee_lpi_entry_timer(struct mac_device_info *hw, u32 et)
 {
 	void __iomem *ioaddr = hw->pcsr;
-	int value = et & STMMAC_ET_MAX;
+	u32 value = et & STMMAC_ET_MAX;
 	int regval;
 
 	/* Program LPI entry timer value into register */
diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.h b/drivers/net/ethernet/stmicro/stmmac/hwif.h
index 2f7295b6c1c5..0f200b72c225 100644
--- a/drivers/net/ethernet/stmicro/stmmac/hwif.h
+++ b/drivers/net/ethernet/stmicro/stmmac/hwif.h
@@ -363,7 +363,7 @@ struct stmmac_ops {
 	void (*set_eee_mode)(struct mac_device_info *hw,
 			     bool en_tx_lpi_clockgating);
 	void (*reset_eee_mode)(struct mac_device_info *hw);
-	void (*set_eee_lpi_entry_timer)(struct mac_device_info *hw, int et);
+	void (*set_eee_lpi_entry_timer)(struct mac_device_info *hw, u32 et);
 	void (*set_eee_timer)(struct mac_device_info *hw, int ls, int tw);
 	void (*set_eee_pls)(struct mac_device_info *hw, int link);
 	void (*debug)(struct stmmac_priv *priv, void __iomem *ioaddr,
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac.h b/drivers/net/ethernet/stmicro/stmmac/stmmac.h
index b8d631e559c0..e8dbce20129c 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac.h
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac.h
@@ -305,11 +305,9 @@ struct stmmac_priv {
 	int clk_csr;
 	struct timer_list eee_ctrl_timer;
 	int lpi_irq;
-	int eee_enabled;
-	int eee_active;
-	int tx_lpi_timer;
-	int tx_lpi_enabled;
-	int eee_tw_timer;
+	u32 tx_lpi_timer;
+	bool eee_enabled;
+	bool eee_active;
 	bool eee_sw_timer_en;
 	unsigned int mode;
 	unsigned int chain_mode;
@@ -405,8 +403,6 @@ void stmmac_dvr_remove(struct device *dev);
 int stmmac_dvr_probe(struct device *device,
 		     struct plat_stmmacenet_data *plat_dat,
 		     struct stmmac_resources *res);
-void stmmac_disable_eee_mode(struct stmmac_priv *priv);
-bool stmmac_eee_init(struct stmmac_priv *priv);
 int stmmac_reinit_queues(struct net_device *dev, u32 rx_cnt, u32 tx_cnt);
 int stmmac_reinit_ringparam(struct net_device *dev, u32 rx_size, u32 tx_size);
 int stmmac_bus_clks_config(struct stmmac_priv *priv, bool enabled);
@@ -416,14 +412,6 @@ static inline bool stmmac_xdp_is_enabled(struct stmmac_priv *priv)
 	return !!priv->xdp_prog;
 }
 
-static inline unsigned int stmmac_rx_offset(struct stmmac_priv *priv)
-{
-	if (stmmac_xdp_is_enabled(priv))
-		return XDP_PACKET_HEADROOM;
-
-	return 0;
-}
-
 void stmmac_disable_rx_queue(struct stmmac_priv *priv, u32 queue);
 void stmmac_enable_rx_queue(struct stmmac_priv *priv, u32 queue);
 void stmmac_disable_tx_queue(struct stmmac_priv *priv, u32 queue);
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c
index 16b4d8c21c90..918a32f8fda8 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c
@@ -654,7 +654,7 @@ static void stmmac_get_ethtool_stats(struct net_device *dev,
 					     (*(u32 *)p);
 			}
 		}
-		if (priv->eee_enabled) {
+		if (priv->dma_cap.eee) {
 			int val = phylink_get_eee_err(priv->phylink);
 			if (val)
 				priv->xstats.phy_eee_wakeup_error_n = val;
@@ -898,9 +898,6 @@ static int stmmac_ethtool_op_get_eee(struct net_device *dev,
 	if (!priv->dma_cap.eee)
 		return -EOPNOTSUPP;
 
-	edata->tx_lpi_timer = priv->tx_lpi_timer;
-	edata->tx_lpi_enabled = priv->tx_lpi_enabled;
-
 	return phylink_ethtool_get_eee(priv->phylink, edata);
 }
 
@@ -908,29 +905,11 @@ static int stmmac_ethtool_op_set_eee(struct net_device *dev,
 				     struct ethtool_keee *edata)
 {
 	struct stmmac_priv *priv = netdev_priv(dev);
-	int ret;
 
 	if (!priv->dma_cap.eee)
 		return -EOPNOTSUPP;
 
-	if (priv->tx_lpi_enabled != edata->tx_lpi_enabled)
-		netdev_warn(priv->dev,
-			    "Setting EEE tx-lpi is not supported\n");
-
-	if (!edata->eee_enabled)
-		stmmac_disable_eee_mode(priv);
-
-	ret = phylink_ethtool_set_eee(priv->phylink, edata);
-	if (ret)
-		return ret;
-
-	if (edata->eee_enabled &&
-	    priv->tx_lpi_timer != edata->tx_lpi_timer) {
-		priv->tx_lpi_timer = edata->tx_lpi_timer;
-		stmmac_eee_init(priv);
-	}
-
-	return 0;
+	return phylink_ethtool_set_eee(priv->phylink, edata);
 }
 
 static u32 stmmac_usec2riwt(u32 usec, struct stmmac_priv *priv)
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 99eaec8bac4a..58b013528dea 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -111,8 +111,8 @@ static const u32 default_msg_level = (NETIF_MSG_DRV | NETIF_MSG_PROBE |
 				      NETIF_MSG_IFDOWN | NETIF_MSG_TIMER);
 
 #define STMMAC_DEFAULT_LPI_TIMER	1000
-static int eee_timer = STMMAC_DEFAULT_LPI_TIMER;
-module_param(eee_timer, int, 0644);
+static unsigned int eee_timer = STMMAC_DEFAULT_LPI_TIMER;
+module_param(eee_timer, uint, 0644);
 MODULE_PARM_DESC(eee_timer, "LPI tx expiration time in msec");
 #define STMMAC_LPI_T(x) (jiffies + usecs_to_jiffies(x))
 
@@ -194,8 +194,6 @@ static void stmmac_verify_args(void)
 		flow_ctrl = FLOW_OFF;
 	if (unlikely((pause < 0) || (pause > 0xffff)))
 		pause = PAUSE_TIME;
-	if (eee_timer < 0)
-		eee_timer = STMMAC_DEFAULT_LPI_TIMER;
 }
 
 static void __stmmac_disable_all_queues(struct stmmac_priv *priv)
@@ -392,14 +390,14 @@ static inline u32 stmmac_rx_dirty(struct stmmac_priv *priv, u32 queue)
 	return dirty;
 }
 
-static void stmmac_lpi_entry_timer_config(struct stmmac_priv *priv, bool en)
+static void stmmac_disable_hw_lpi_timer(struct stmmac_priv *priv)
 {
-	int tx_lpi_timer;
+	stmmac_set_eee_lpi_timer(priv, priv->hw, 0);
+}
 
-	/* Clear/set the SW EEE timer flag based on LPI ET enablement */
-	priv->eee_sw_timer_en = en ? 0 : 1;
-	tx_lpi_timer  = en ? priv->tx_lpi_timer : 0;
-	stmmac_set_eee_lpi_timer(priv, priv->hw, tx_lpi_timer);
+static void stmmac_enable_hw_lpi_timer(struct stmmac_priv *priv)
+{
+	stmmac_set_eee_lpi_timer(priv, priv->hw, priv->tx_lpi_timer);
 }
 
 /**
@@ -429,18 +427,13 @@ static int stmmac_enable_eee_mode(struct stmmac_priv *priv)
 }
 
 /**
- * stmmac_disable_eee_mode - disable and exit from LPI mode
+ * stmmac_disable_sw_eee_mode - disable and exit from LPI mode
  * @priv: driver private structure
  * Description: this function is to exit and disable EEE in case of
  * LPI state is true. This is called by the xmit.
  */
-void stmmac_disable_eee_mode(struct stmmac_priv *priv)
+static void stmmac_disable_sw_eee_mode(struct stmmac_priv *priv)
 {
-	if (!priv->eee_sw_timer_en) {
-		stmmac_lpi_entry_timer_config(priv, 0);
-		return;
-	}
-
 	stmmac_reset_eee_mode(priv, priv->hw);
 	del_timer_sync(&priv->eee_ctrl_timer);
 	priv->tx_path_in_lpi_mode = false;
@@ -464,18 +457,21 @@ static void stmmac_eee_ctrl_timer(struct timer_list *t)
 /**
  * stmmac_eee_init - init EEE
  * @priv: driver private structure
+ * @active: indicates whether EEE should be enabled.
  * Description:
  *  if the GMAC supports the EEE (from the HW cap reg) and the phy device
  *  can also manage EEE, this function enable the LPI state and start related
  *  timer.
  */
-bool stmmac_eee_init(struct stmmac_priv *priv)
+static void stmmac_eee_init(struct stmmac_priv *priv, bool active)
 {
-	int eee_tw_timer = priv->eee_tw_timer;
+	priv->eee_active = active;
 
 	/* Check if MAC core supports the EEE feature. */
-	if (!priv->dma_cap.eee)
-		return false;
+	if (!priv->dma_cap.eee) {
+		priv->eee_enabled = false;
+		return;
+	}
 
 	mutex_lock(&priv->lock);
 
@@ -483,22 +479,24 @@ bool stmmac_eee_init(struct stmmac_priv *priv)
 	if (!priv->eee_active) {
 		if (priv->eee_enabled) {
 			netdev_dbg(priv->dev, "disable EEE\n");
-			stmmac_lpi_entry_timer_config(priv, 0);
+			priv->eee_sw_timer_en = true;
+			stmmac_disable_hw_lpi_timer(priv);
 			del_timer_sync(&priv->eee_ctrl_timer);
-			stmmac_set_eee_timer(priv, priv->hw, 0, eee_tw_timer);
+			stmmac_set_eee_timer(priv, priv->hw, 0,
+					     STMMAC_DEFAULT_TWT_LS);
 			if (priv->hw->xpcs)
 				xpcs_config_eee(priv->hw->xpcs,
 						priv->plat->mult_fact_100ns,
 						false);
 		}
+		priv->eee_enabled = false;
 		mutex_unlock(&priv->lock);
-		return false;
+		return;
 	}
 
 	if (priv->eee_active && !priv->eee_enabled) {
-		timer_setup(&priv->eee_ctrl_timer, stmmac_eee_ctrl_timer, 0);
 		stmmac_set_eee_timer(priv, priv->hw, STMMAC_DEFAULT_LIT_LS,
-				     eee_tw_timer);
+				     STMMAC_DEFAULT_TWT_LS);
 		if (priv->hw->xpcs)
 			xpcs_config_eee(priv->hw->xpcs,
 					priv->plat->mult_fact_100ns,
@@ -506,18 +504,23 @@ bool stmmac_eee_init(struct stmmac_priv *priv)
 	}
 
 	if (priv->plat->has_gmac4 && priv->tx_lpi_timer <= STMMAC_ET_MAX) {
+		/* Use hardware LPI mode */
 		del_timer_sync(&priv->eee_ctrl_timer);
 		priv->tx_path_in_lpi_mode = false;
-		stmmac_lpi_entry_timer_config(priv, 1);
+		priv->eee_sw_timer_en = false;
+		stmmac_enable_hw_lpi_timer(priv);
 	} else {
-		stmmac_lpi_entry_timer_config(priv, 0);
+		/* Use software LPI mode */
+		priv->eee_sw_timer_en = true;
+		stmmac_disable_hw_lpi_timer(priv);
 		mod_timer(&priv->eee_ctrl_timer,
 			  STMMAC_LPI_T(priv->tx_lpi_timer));
 	}
 
+	priv->eee_enabled = true;
+
 	mutex_unlock(&priv->lock);
 	netdev_dbg(priv->dev, "Energy-Efficient Ethernet initialized\n");
-	return true;
 }
 
 /* stmmac_get_tx_hwtstamp - get HW TX timestamps
@@ -974,9 +977,7 @@ static void stmmac_mac_link_down(struct phylink_config *config,
 	struct stmmac_priv *priv = netdev_priv(to_net_dev(config->dev));
 
 	stmmac_mac_set(priv, priv->ioaddr, false);
-	priv->eee_active = false;
-	priv->tx_lpi_enabled = false;
-	priv->eee_enabled = stmmac_eee_init(priv);
+	stmmac_eee_init(priv, false);
 	stmmac_set_eee_pls(priv, priv->hw, false);
 
 	if (stmmac_fpe_supported(priv))
@@ -1085,11 +1086,10 @@ static void stmmac_mac_link_up(struct phylink_config *config,
 
 	stmmac_mac_set(priv, priv->ioaddr, true);
 	if (phy && priv->dma_cap.eee) {
-		priv->eee_active =
-			phy_init_eee(phy, !(priv->plat->flags &
-				STMMAC_FLAG_RX_CLK_RUNS_IN_LPI)) >= 0;
-		priv->eee_enabled = stmmac_eee_init(priv);
-		priv->tx_lpi_enabled = priv->eee_enabled;
+		phy_eee_rx_clock_stop(phy, !(priv->plat->flags &
+					     STMMAC_FLAG_RX_CLK_RUNS_IN_LPI));
+		priv->tx_lpi_timer = phy->eee_cfg.tx_lpi_timer;
+		stmmac_eee_init(priv, phy->enable_tx_lpi);
 		stmmac_set_eee_pls(priv, priv->hw, true);
 	}
 
@@ -1187,6 +1187,16 @@ static int stmmac_init_phy(struct net_device *dev)
 		ret = phylink_fwnode_phy_connect(priv->phylink, fwnode, 0);
 	}
 
+	if (ret == 0) {
+		struct ethtool_keee eee;
+
+		/* Configure phylib's copy of the LPI timer */
+		if (!phylink_ethtool_get_eee(priv->phylink, &eee)) {
+			eee.tx_lpi_timer = priv->tx_lpi_timer;
+			phylink_ethtool_set_eee(priv->phylink, &eee);
+		}
+	}
+
 	if (!priv->plat->pmt) {
 		struct ethtool_wolinfo wol = { .cmd = ETHTOOL_GWOL };
 
@@ -1203,6 +1213,7 @@ static int stmmac_phy_setup(struct stmmac_priv *priv)
 	struct stmmac_mdio_bus_data *mdio_bus_data;
 	int mode = priv->plat->phy_interface;
 	struct fwnode_handle *fwnode;
+	struct phylink_pcs *pcs;
 	struct phylink *phylink;
 
 	priv->phylink_config.dev = &priv->dev->dev;
@@ -1224,8 +1235,14 @@ static int stmmac_phy_setup(struct stmmac_priv *priv)
 
 	/* If we have an xpcs, it defines which PHY interfaces are supported. */
 	if (priv->hw->xpcs)
-		xpcs_get_interfaces(priv->hw->xpcs,
-				    priv->phylink_config.supported_interfaces);
+		pcs = xpcs_to_phylink_pcs(priv->hw->xpcs);
+	else
+		pcs = priv->hw->phylink_pcs;
+
+	if (pcs)
+		phy_interface_or(priv->phylink_config.supported_interfaces,
+				 priv->phylink_config.supported_interfaces,
+				 pcs->supported_interfaces);
 
 	fwnode = priv->plat->port_node;
 	if (!fwnode)
@@ -1308,6 +1325,14 @@ static void stmmac_display_rings(struct stmmac_priv *priv,
 	stmmac_display_tx_rings(priv, dma_conf);
 }
 
+static unsigned int stmmac_rx_offset(struct stmmac_priv *priv)
+{
+	if (stmmac_xdp_is_enabled(priv))
+		return XDP_PACKET_HEADROOM;
+
+	return 0;
+}
+
 static int stmmac_set_bfsize(int mtu, int bufsize)
 {
 	int ret = bufsize;
@@ -3437,12 +3462,6 @@ static int stmmac_hw_setup(struct net_device *dev, bool ptp_register)
 	else if (ptp_register)
 		stmmac_ptp_register(priv);
 
-	priv->eee_tw_timer = STMMAC_DEFAULT_TWT_LS;
-
-	/* Convert the timer from msec to usec */
-	if (!priv->tx_lpi_timer)
-		priv->tx_lpi_timer = eee_timer * 1000;
-
 	if (priv->use_riwt) {
 		u32 queue;
 
@@ -3909,6 +3928,10 @@ static int __stmmac_open(struct net_device *dev,
 	u32 chan;
 	int ret;
 
+	/* Initialise the tx lpi timer, converting from msec to usec */
+	if (!priv->tx_lpi_timer)
+		priv->tx_lpi_timer = eee_timer * 1000;
+
 	ret = pm_runtime_resume_and_get(priv->device);
 	if (ret < 0)
 		return ret;
@@ -4023,11 +4046,6 @@ static int stmmac_release(struct net_device *dev)
 	/* Free the IRQ lines */
 	stmmac_free_irq(dev, REQ_IRQ_ERR_ALL, 0);
 
-	if (priv->eee_enabled) {
-		priv->tx_path_in_lpi_mode = false;
-		del_timer_sync(&priv->eee_ctrl_timer);
-	}
-
 	/* Stop TX/RX DMA and clear the descriptors */
 	stmmac_stop_all_dma(priv);
 
@@ -4479,7 +4497,7 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev)
 	first_tx = tx_q->cur_tx;
 
 	if (priv->tx_path_in_lpi_mode && priv->eee_sw_timer_en)
-		stmmac_disable_eee_mode(priv);
+		stmmac_disable_sw_eee_mode(priv);
 
 	/* Manage oversized TCP frames for GMAC4 device */
 	if (skb_is_gso(skb) && priv->tso) {
@@ -5460,7 +5478,7 @@ read_again:
 		if (priv->extend_desc)
 			stmmac_rx_extended_status(priv, &priv->xstats, rx_q->dma_erx + entry);
 		if (unlikely(status == discard_frame)) {
-			page_pool_recycle_direct(rx_q->page_pool, buf->page);
+			page_pool_put_page(rx_q->page_pool, buf->page, 0, true);
 			buf->page = NULL;
 			error = 1;
 			if (!priv->hwts_rx_en)
@@ -7394,6 +7412,8 @@ int stmmac_dvr_probe(struct device *device,
 
 	INIT_WORK(&priv->service_task, stmmac_service_task);
 
+	timer_setup(&priv->eee_ctrl_timer, stmmac_eee_ctrl_timer, 0);
+
 	/* Override with kernel parameters if supplied XXX CRS XXX
 	 * this needs to have multiple instances
 	 */
diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c
index 5465bf872734..dcb6662b473d 100644
--- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c
+++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c
@@ -762,7 +762,7 @@ static int am65_cpsw_nuss_common_open(struct am65_cpsw_common *common)
 			     ALE_DEFAULT_THREAD_ID, 0);
 	cpsw_ale_control_set(common->ale, HOST_PORT_NUM,
 			     ALE_DEFAULT_THREAD_ENABLE, 1);
-	/* switch to vlan unaware mode */
+	/* switch to vlan aware mode */
 	cpsw_ale_control_set(common->ale, HOST_PORT_NUM, ALE_VLAN_AWARE, 1);
 	cpsw_ale_control_set(common->ale, HOST_PORT_NUM,
 			     ALE_PORT_STATE, ALE_PORT_STATE_FORWARD);
diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
index 1e290ee8edfd..0cb6fa6e5b7d 100644
--- a/drivers/net/ethernet/ti/cpsw.c
+++ b/drivers/net/ethernet/ti/cpsw.c
@@ -686,7 +686,7 @@ static void cpsw_init_host_port(struct cpsw_priv *priv)
 	soft_reset("cpsw", &cpsw->regs->soft_reset);
 	cpsw_ale_start(cpsw->ale);
 
-	/* switch to vlan unaware mode */
+	/* switch to vlan aware mode */
 	cpsw_ale_control_set(cpsw->ale, HOST_PORT_NUM, ALE_VLAN_AWARE,
 			     CPSW_ALE_VLAN_AWARE);
 	control_reg = readl(&cpsw->regs->control);
diff --git a/drivers/net/ethernet/ti/cpsw_new.c b/drivers/net/ethernet/ti/cpsw_new.c
index be4d90c1cbe7..cec0a90659d9 100644
--- a/drivers/net/ethernet/ti/cpsw_new.c
+++ b/drivers/net/ethernet/ti/cpsw_new.c
@@ -554,7 +554,7 @@ static void cpsw_init_host_port(struct cpsw_priv *priv)
 	soft_reset("cpsw", &cpsw->regs->soft_reset);
 	cpsw_ale_start(cpsw->ale);
 
-	/* switch to vlan unaware mode */
+	/* switch to vlan aware mode */
 	cpsw_ale_control_set(cpsw->ale, HOST_PORT_NUM, ALE_VLAN_AWARE,
 			     CPSW_ALE_VLAN_AWARE);
 	control_reg = readl(&cpsw->regs->control);
diff --git a/drivers/net/ethernet/wangxun/libwx/wx_hw.c b/drivers/net/ethernet/wangxun/libwx/wx_hw.c
index 1bf9c38e4125..deaf670c160e 100644
--- a/drivers/net/ethernet/wangxun/libwx/wx_hw.c
+++ b/drivers/net/ethernet/wangxun/libwx/wx_hw.c
@@ -334,27 +334,25 @@ int wx_host_interface_command(struct wx *wx, u32 *buffer,
 	status = read_poll_timeout(rd32, hicr, hicr & WX_MNG_MBOX_CTL_FWRDY, 1000,
 				   timeout * 1000, false, wx, WX_MNG_MBOX_CTL);
 
+	buf[0] = rd32(wx, WX_MNG_MBOX);
+	if ((buf[0] & 0xff0000) >> 16 == 0x80) {
+		wx_err(wx, "Unknown FW command: 0x%x\n", buffer[0] & 0xff);
+		status = -EINVAL;
+		goto rel_out;
+	}
+
 	/* Check command completion */
 	if (status) {
-		wx_dbg(wx, "Command has failed with no status valid.\n");
-
-		buf[0] = rd32(wx, WX_MNG_MBOX);
-		if ((buffer[0] & 0xff) != (~buf[0] >> 24)) {
-			status = -EINVAL;
-			goto rel_out;
-		}
-		if ((buf[0] & 0xff0000) >> 16 == 0x80) {
-			wx_dbg(wx, "It's unknown cmd.\n");
-			status = -EINVAL;
-			goto rel_out;
-		}
-
+		wx_err(wx, "Command has failed with no status valid.\n");
 		wx_dbg(wx, "write value:\n");
 		for (i = 0; i < dword_len; i++)
 			wx_dbg(wx, "%x ", buffer[i]);
 		wx_dbg(wx, "read value:\n");
 		for (i = 0; i < dword_len; i++)
 			wx_dbg(wx, "%x ", buf[i]);
+		wx_dbg(wx, "\ncheck: %x %x\n", buffer[0] & 0xff, ~buf[0] >> 24);
+
+		goto rel_out;
 	}
 
 	if (!return_data)
diff --git a/drivers/net/ieee802154/ca8210.c b/drivers/net/ieee802154/ca8210.c
index e685a7f946f0..753215ebc67c 100644
--- a/drivers/net/ieee802154/ca8210.c
+++ b/drivers/net/ieee802154/ca8210.c
@@ -3072,7 +3072,11 @@ static int ca8210_probe(struct spi_device *spi_device)
 	spi_set_drvdata(priv->spi, priv);
 	if (IS_ENABLED(CONFIG_IEEE802154_CA8210_DEBUGFS)) {
 		cascoda_api_upstream = ca8210_test_int_driver_write;
-		ca8210_test_interface_init(priv);
+		ret = ca8210_test_interface_init(priv);
+		if (ret) {
+			dev_crit(&spi_device->dev, "ca8210_test_interface_init failed\n");
+			goto error;
+		}
 	} else {
 		cascoda_api_upstream = NULL;
 	}
diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c
index ee2c3cf4df36..da3a97a65507 100644
--- a/drivers/net/ipvlan/ipvlan_main.c
+++ b/drivers/net/ipvlan/ipvlan_main.c
@@ -799,6 +799,12 @@ static int ipvlan_device_event(struct notifier_block *unused,
 	case NETDEV_PRE_TYPE_CHANGE:
 		/* Forbid underlying device to change its type. */
 		return NOTIFY_BAD;
+
+	case NETDEV_NOTIFY_PEERS:
+	case NETDEV_BONDING_FAILOVER:
+	case NETDEV_RESEND_IGMP:
+		list_for_each_entry(ipvlan, &port->ipvlans, pnode)
+			call_netdevice_notifiers(event, ipvlan->dev);
 	}
 	return NOTIFY_DONE;
 }
diff --git a/drivers/net/mctp/mctp-i3c.c b/drivers/net/mctp/mctp-i3c.c
index 9adad59b8676..d247fe483c58 100644
--- a/drivers/net/mctp/mctp-i3c.c
+++ b/drivers/net/mctp/mctp-i3c.c
@@ -125,6 +125,8 @@ static int mctp_i3c_read(struct mctp_i3c_device *mi)
 
 	xfer.data.in = skb_put(skb, mi->mrl);
 
+	/* Make sure netif_rx() is read in the same order as i3c. */
+	mutex_lock(&mi->lock);
 	rc = i3c_device_do_priv_xfers(mi->i3c, &xfer, 1);
 	if (rc < 0)
 		goto err;
@@ -166,8 +168,10 @@ static int mctp_i3c_read(struct mctp_i3c_device *mi)
 		stats->rx_dropped++;
 	}
 
+	mutex_unlock(&mi->lock);
 	return 0;
 err:
+	mutex_unlock(&mi->lock);
 	kfree_skb(skb);
 	return rc;
 }
diff --git a/drivers/net/netconsole.c b/drivers/net/netconsole.c
index f422a2f666ef..86ab4a42769a 100644
--- a/drivers/net/netconsole.c
+++ b/drivers/net/netconsole.c
@@ -730,7 +730,7 @@ static void update_userdata(struct netconsole_target *nt)
 		struct userdatum *udm_item;
 		struct config_item *item;
 
-		if (child_count >= MAX_USERDATA_ITEMS)
+		if (WARN_ON_ONCE(child_count >= MAX_USERDATA_ITEMS))
 			break;
 		child_count++;
 
diff --git a/drivers/net/netdevsim/netdev.c b/drivers/net/netdevsim/netdev.c
index e068a9761c09..d013b6498539 100644
--- a/drivers/net/netdevsim/netdev.c
+++ b/drivers/net/netdevsim/netdev.c
@@ -20,6 +20,7 @@
 #include <linux/netdevice.h>
 #include <linux/slab.h>
 #include <net/netdev_queues.h>
+#include <net/netdev_rx_queue.h>
 #include <net/page_pool/helpers.h>
 #include <net/netlink.h>
 #include <net/net_shaper.h>
@@ -29,6 +30,8 @@
 
 #include "netdevsim.h"
 
+MODULE_IMPORT_NS("NETDEV_INTERNAL");
+
 #define NSIM_RING_SIZE		256
 
 static int nsim_napi_rx(struct nsim_rq *rq, struct sk_buff *skb)
@@ -69,7 +72,7 @@ static netdev_tx_t nsim_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	rxq = skb_get_queue_mapping(skb);
 	if (rxq >= peer_dev->num_rx_queues)
 		rxq = rxq % peer_dev->num_rx_queues;
-	rq = &peer_ns->rq[rxq];
+	rq = peer_ns->rq[rxq];
 
 	skb_tx_timestamp(skb);
 	if (unlikely(nsim_forward_skb(peer_dev, skb, rq) == NET_RX_DROP))
@@ -359,25 +362,24 @@ static int nsim_poll(struct napi_struct *napi, int budget)
 	return done;
 }
 
-static int nsim_create_page_pool(struct nsim_rq *rq)
+static int nsim_create_page_pool(struct page_pool **p, struct napi_struct *napi)
 {
-	struct page_pool_params p = {
+	struct page_pool_params params = {
 		.order = 0,
 		.pool_size = NSIM_RING_SIZE,
 		.nid = NUMA_NO_NODE,
-		.dev = &rq->napi.dev->dev,
-		.napi = &rq->napi,
+		.dev = &napi->dev->dev,
+		.napi = napi,
 		.dma_dir = DMA_BIDIRECTIONAL,
-		.netdev = rq->napi.dev,
+		.netdev = napi->dev,
 	};
+	struct page_pool *pool;
 
-	rq->page_pool = page_pool_create(&p);
-	if (IS_ERR(rq->page_pool)) {
-		int err = PTR_ERR(rq->page_pool);
+	pool = page_pool_create(&params);
+	if (IS_ERR(pool))
+		return PTR_ERR(pool);
 
-		rq->page_pool = NULL;
-		return err;
-	}
+	*p = pool;
 	return 0;
 }
 
@@ -388,15 +390,15 @@ static int nsim_init_napi(struct netdevsim *ns)
 	int err, i;
 
 	for (i = 0; i < dev->num_rx_queues; i++) {
-		rq = &ns->rq[i];
+		rq = ns->rq[i];
 
-		netif_napi_add(dev, &rq->napi, nsim_poll);
+		netif_napi_add_config(dev, &rq->napi, nsim_poll, i);
 	}
 
 	for (i = 0; i < dev->num_rx_queues; i++) {
-		rq = &ns->rq[i];
+		rq = ns->rq[i];
 
-		err = nsim_create_page_pool(rq);
+		err = nsim_create_page_pool(&rq->page_pool, &rq->napi);
 		if (err)
 			goto err_pp_destroy;
 	}
@@ -405,12 +407,12 @@ static int nsim_init_napi(struct netdevsim *ns)
 
 err_pp_destroy:
 	while (i--) {
-		page_pool_destroy(ns->rq[i].page_pool);
-		ns->rq[i].page_pool = NULL;
+		page_pool_destroy(ns->rq[i]->page_pool);
+		ns->rq[i]->page_pool = NULL;
 	}
 
 	for (i = 0; i < dev->num_rx_queues; i++)
-		__netif_napi_del(&ns->rq[i].napi);
+		__netif_napi_del(&ns->rq[i]->napi);
 
 	return err;
 }
@@ -421,7 +423,7 @@ static void nsim_enable_napi(struct netdevsim *ns)
 	int i;
 
 	for (i = 0; i < dev->num_rx_queues; i++) {
-		struct nsim_rq *rq = &ns->rq[i];
+		struct nsim_rq *rq = ns->rq[i];
 
 		netif_queue_set_napi(dev, i, NETDEV_QUEUE_TYPE_RX, &rq->napi);
 		napi_enable(&rq->napi);
@@ -448,7 +450,7 @@ static void nsim_del_napi(struct netdevsim *ns)
 	int i;
 
 	for (i = 0; i < dev->num_rx_queues; i++) {
-		struct nsim_rq *rq = &ns->rq[i];
+		struct nsim_rq *rq = ns->rq[i];
 
 		napi_disable(&rq->napi);
 		__netif_napi_del(&rq->napi);
@@ -456,8 +458,8 @@ static void nsim_del_napi(struct netdevsim *ns)
 	synchronize_net();
 
 	for (i = 0; i < dev->num_rx_queues; i++) {
-		page_pool_destroy(ns->rq[i].page_pool);
-		ns->rq[i].page_pool = NULL;
+		page_pool_destroy(ns->rq[i]->page_pool);
+		ns->rq[i]->page_pool = NULL;
 	}
 }
 
@@ -595,6 +597,182 @@ static const struct netdev_stat_ops nsim_stat_ops = {
 	.get_base_stats		= nsim_get_base_stats,
 };
 
+static struct nsim_rq *nsim_queue_alloc(void)
+{
+	struct nsim_rq *rq;
+
+	rq = kzalloc(sizeof(*rq), GFP_KERNEL_ACCOUNT);
+	if (!rq)
+		return NULL;
+
+	skb_queue_head_init(&rq->skb_queue);
+	return rq;
+}
+
+static void nsim_queue_free(struct nsim_rq *rq)
+{
+	skb_queue_purge_reason(&rq->skb_queue, SKB_DROP_REASON_QUEUE_PURGE);
+	kfree(rq);
+}
+
+/* Queue reset mode is controlled by ns->rq_reset_mode.
+ * - normal - new NAPI new pool (old NAPI enabled when new added)
+ * - mode 1 - allocate new pool (NAPI is only disabled / enabled)
+ * - mode 2 - new NAPI new pool (old NAPI removed before new added)
+ * - mode 3 - new NAPI new pool (old NAPI disabled when new added)
+ */
+struct nsim_queue_mem {
+	struct nsim_rq *rq;
+	struct page_pool *pp;
+};
+
+static int
+nsim_queue_mem_alloc(struct net_device *dev, void *per_queue_mem, int idx)
+{
+	struct nsim_queue_mem *qmem = per_queue_mem;
+	struct netdevsim *ns = netdev_priv(dev);
+	int err;
+
+	if (ns->rq_reset_mode > 3)
+		return -EINVAL;
+
+	if (ns->rq_reset_mode == 1)
+		return nsim_create_page_pool(&qmem->pp, &ns->rq[idx]->napi);
+
+	qmem->rq = nsim_queue_alloc();
+	if (!qmem->rq)
+		return -ENOMEM;
+
+	err = nsim_create_page_pool(&qmem->rq->page_pool, &qmem->rq->napi);
+	if (err)
+		goto err_free;
+
+	if (!ns->rq_reset_mode)
+		netif_napi_add_config(dev, &qmem->rq->napi, nsim_poll, idx);
+
+	return 0;
+
+err_free:
+	nsim_queue_free(qmem->rq);
+	return err;
+}
+
+static void nsim_queue_mem_free(struct net_device *dev, void *per_queue_mem)
+{
+	struct nsim_queue_mem *qmem = per_queue_mem;
+	struct netdevsim *ns = netdev_priv(dev);
+
+	page_pool_destroy(qmem->pp);
+	if (qmem->rq) {
+		if (!ns->rq_reset_mode)
+			netif_napi_del(&qmem->rq->napi);
+		page_pool_destroy(qmem->rq->page_pool);
+		nsim_queue_free(qmem->rq);
+	}
+}
+
+static int
+nsim_queue_start(struct net_device *dev, void *per_queue_mem, int idx)
+{
+	struct nsim_queue_mem *qmem = per_queue_mem;
+	struct netdevsim *ns = netdev_priv(dev);
+
+	if (ns->rq_reset_mode == 1) {
+		ns->rq[idx]->page_pool = qmem->pp;
+		napi_enable(&ns->rq[idx]->napi);
+		return 0;
+	}
+
+	/* netif_napi_add()/_del() should normally be called from alloc/free,
+	 * here we want to test various call orders.
+	 */
+	if (ns->rq_reset_mode == 2) {
+		netif_napi_del(&ns->rq[idx]->napi);
+		netif_napi_add_config(dev, &qmem->rq->napi, nsim_poll, idx);
+	} else if (ns->rq_reset_mode == 3) {
+		netif_napi_add_config(dev, &qmem->rq->napi, nsim_poll, idx);
+		netif_napi_del(&ns->rq[idx]->napi);
+	}
+
+	ns->rq[idx] = qmem->rq;
+	napi_enable(&ns->rq[idx]->napi);
+
+	return 0;
+}
+
+static int nsim_queue_stop(struct net_device *dev, void *per_queue_mem, int idx)
+{
+	struct nsim_queue_mem *qmem = per_queue_mem;
+	struct netdevsim *ns = netdev_priv(dev);
+
+	napi_disable(&ns->rq[idx]->napi);
+
+	if (ns->rq_reset_mode == 1) {
+		qmem->pp = ns->rq[idx]->page_pool;
+		page_pool_disable_direct_recycling(qmem->pp);
+	} else {
+		qmem->rq = ns->rq[idx];
+	}
+
+	return 0;
+}
+
+static const struct netdev_queue_mgmt_ops nsim_queue_mgmt_ops = {
+	.ndo_queue_mem_size	= sizeof(struct nsim_queue_mem),
+	.ndo_queue_mem_alloc	= nsim_queue_mem_alloc,
+	.ndo_queue_mem_free	= nsim_queue_mem_free,
+	.ndo_queue_start	= nsim_queue_start,
+	.ndo_queue_stop		= nsim_queue_stop,
+};
+
+static ssize_t
+nsim_qreset_write(struct file *file, const char __user *data,
+		  size_t count, loff_t *ppos)
+{
+	struct netdevsim *ns = file->private_data;
+	unsigned int queue, mode;
+	char buf[32];
+	ssize_t ret;
+
+	if (count >= sizeof(buf))
+		return -EINVAL;
+	if (copy_from_user(buf, data, count))
+		return -EFAULT;
+	buf[count] = '\0';
+
+	ret = sscanf(buf, "%u %u", &queue, &mode);
+	if (ret != 2)
+		return -EINVAL;
+
+	rtnl_lock();
+	if (!netif_running(ns->netdev)) {
+		ret = -ENETDOWN;
+		goto exit_unlock;
+	}
+
+	if (queue >= ns->netdev->real_num_rx_queues) {
+		ret = -EINVAL;
+		goto exit_unlock;
+	}
+
+	ns->rq_reset_mode = mode;
+	ret = netdev_rx_queue_restart(ns->netdev, queue);
+	ns->rq_reset_mode = 0;
+	if (ret)
+		goto exit_unlock;
+
+	ret = count;
+exit_unlock:
+	rtnl_unlock();
+	return ret;
+}
+
+static const struct file_operations nsim_qreset_fops = {
+	.open = simple_open,
+	.write = nsim_qreset_write,
+	.owner = THIS_MODULE,
+};
+
 static ssize_t
 nsim_pp_hold_read(struct file *file, char __user *data,
 		  size_t count, loff_t *ppos)
@@ -628,7 +806,7 @@ nsim_pp_hold_write(struct file *file, const char __user *data,
 	if (!netif_running(ns->netdev) && val) {
 		ret = -ENETDOWN;
 	} else if (val) {
-		ns->page = page_pool_dev_alloc_pages(ns->rq[0].page_pool);
+		ns->page = page_pool_dev_alloc_pages(ns->rq[0]->page_pool);
 		if (!ns->page)
 			ret = -ENOMEM;
 	} else {
@@ -677,27 +855,35 @@ static int nsim_queue_init(struct netdevsim *ns)
 	struct net_device *dev = ns->netdev;
 	int i;
 
-	ns->rq = kvcalloc(dev->num_rx_queues, sizeof(*ns->rq),
-			  GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
+	ns->rq = kcalloc(dev->num_rx_queues, sizeof(*ns->rq),
+			 GFP_KERNEL_ACCOUNT);
 	if (!ns->rq)
 		return -ENOMEM;
 
-	for (i = 0; i < dev->num_rx_queues; i++)
-		skb_queue_head_init(&ns->rq[i].skb_queue);
+	for (i = 0; i < dev->num_rx_queues; i++) {
+		ns->rq[i] = nsim_queue_alloc();
+		if (!ns->rq[i])
+			goto err_free_prev;
+	}
 
 	return 0;
+
+err_free_prev:
+	while (i--)
+		kfree(ns->rq[i]);
+	kfree(ns->rq);
+	return -ENOMEM;
 }
 
-static void nsim_queue_free(struct netdevsim *ns)
+static void nsim_queue_uninit(struct netdevsim *ns)
 {
 	struct net_device *dev = ns->netdev;
 	int i;
 
 	for (i = 0; i < dev->num_rx_queues; i++)
-		skb_queue_purge_reason(&ns->rq[i].skb_queue,
-				       SKB_DROP_REASON_QUEUE_PURGE);
+		nsim_queue_free(ns->rq[i]);
 
-	kvfree(ns->rq);
+	kfree(ns->rq);
 	ns->rq = NULL;
 }
 
@@ -713,6 +899,7 @@ static int nsim_init_netdevsim(struct netdevsim *ns)
 	ns->phc = phc;
 	ns->netdev->netdev_ops = &nsim_netdev_ops;
 	ns->netdev->stat_ops = &nsim_stat_ops;
+	ns->netdev->queue_mgmt_ops = &nsim_queue_mgmt_ops;
 
 	err = nsim_udp_tunnels_info_create(ns->nsim_dev, ns->netdev);
 	if (err)
@@ -741,7 +928,7 @@ err_ipsec_teardown:
 	nsim_macsec_teardown(ns);
 	nsim_bpf_uninit(ns);
 err_rq_destroy:
-	nsim_queue_free(ns);
+	nsim_queue_uninit(ns);
 err_utn_destroy:
 	rtnl_unlock();
 	nsim_udp_tunnels_info_destroy(ns->netdev);
@@ -798,6 +985,9 @@ nsim_create(struct nsim_dev *nsim_dev, struct nsim_dev_port *nsim_dev_port)
 
 	ns->pp_dfs = debugfs_create_file("pp_hold", 0600, nsim_dev_port->ddir,
 					 ns, &nsim_pp_hold_fops);
+	ns->qr_dfs = debugfs_create_file("queue_reset", 0200,
+					 nsim_dev_port->ddir, ns,
+					 &nsim_qreset_fops);
 
 	return ns;
 
@@ -811,6 +1001,7 @@ void nsim_destroy(struct netdevsim *ns)
 	struct net_device *dev = ns->netdev;
 	struct netdevsim *peer;
 
+	debugfs_remove(ns->qr_dfs);
 	debugfs_remove(ns->pp_dfs);
 
 	rtnl_lock();
@@ -823,7 +1014,7 @@ void nsim_destroy(struct netdevsim *ns)
 		nsim_macsec_teardown(ns);
 		nsim_ipsec_teardown(ns);
 		nsim_bpf_uninit(ns);
-		nsim_queue_free(ns);
+		nsim_queue_uninit(ns);
 	}
 	rtnl_unlock();
 	if (nsim_dev_port_is_pf(ns->nsim_dev_port))
diff --git a/drivers/net/netdevsim/netdevsim.h b/drivers/net/netdevsim/netdevsim.h
index bf02efa10956..a70f62af4c88 100644
--- a/drivers/net/netdevsim/netdevsim.h
+++ b/drivers/net/netdevsim/netdevsim.h
@@ -101,7 +101,9 @@ struct netdevsim {
 	struct nsim_dev *nsim_dev;
 	struct nsim_dev_port *nsim_dev_port;
 	struct mock_phc *phc;
-	struct nsim_rq *rq;
+	struct nsim_rq **rq;
+
+	int rq_reset_mode;
 
 	u64 tx_packets;
 	u64 tx_bytes;
@@ -134,6 +136,7 @@ struct netdevsim {
 
 	struct page *page;
 	struct dentry *pp_dfs;
+	struct dentry *qr_dfs;
 
 	struct nsim_ethtool ethtool;
 	struct netdevsim __rcu *peer;
diff --git a/drivers/net/netkit.c b/drivers/net/netkit.c
index c1d881dc6409..1e1b00756be7 100644
--- a/drivers/net/netkit.c
+++ b/drivers/net/netkit.c
@@ -338,6 +338,7 @@ static int netkit_new_link(struct net *peer_net, struct net_device *dev,
 	enum netkit_scrub scrub_peer = NETKIT_SCRUB_DEFAULT;
 	enum netkit_mode mode = NETKIT_L3;
 	unsigned char ifname_assign_type;
+	u16 headroom = 0, tailroom = 0;
 	struct ifinfomsg *ifmp = NULL;
 	struct net_device *peer;
 	char ifname[IFNAMSIZ];
@@ -371,6 +372,10 @@ static int netkit_new_link(struct net *peer_net, struct net_device *dev,
 			if (err < 0)
 				return err;
 		}
+		if (data[IFLA_NETKIT_HEADROOM])
+			headroom = nla_get_u16(data[IFLA_NETKIT_HEADROOM]);
+		if (data[IFLA_NETKIT_TAILROOM])
+			tailroom = nla_get_u16(data[IFLA_NETKIT_TAILROOM]);
 	}
 
 	if (ifmp && tbp[IFLA_IFNAME]) {
@@ -390,6 +395,14 @@ static int netkit_new_link(struct net *peer_net, struct net_device *dev,
 		return PTR_ERR(peer);
 
 	netif_inherit_tso_max(peer, dev);
+	if (headroom) {
+		peer->needed_headroom = headroom;
+		dev->needed_headroom = headroom;
+	}
+	if (tailroom) {
+		peer->needed_tailroom = tailroom;
+		dev->needed_tailroom = tailroom;
+	}
 
 	if (mode == NETKIT_L2 && !(ifmp && tbp[IFLA_ADDRESS]))
 		eth_hw_addr_random(peer);
@@ -401,6 +414,7 @@ static int netkit_new_link(struct net *peer_net, struct net_device *dev,
 	nk->policy = policy_peer;
 	nk->scrub = scrub_peer;
 	nk->mode = mode;
+	nk->headroom = headroom;
 	bpf_mprog_bundle_init(&nk->bundle);
 
 	err = register_netdevice(peer);
@@ -426,6 +440,7 @@ static int netkit_new_link(struct net *peer_net, struct net_device *dev,
 	nk->policy = policy_prim;
 	nk->scrub = scrub_prim;
 	nk->mode = mode;
+	nk->headroom = headroom;
 	bpf_mprog_bundle_init(&nk->bundle);
 
 	err = register_netdevice(dev);
@@ -850,7 +865,18 @@ static int netkit_change_link(struct net_device *dev, struct nlattr *tb[],
 	struct net_device *peer = rtnl_dereference(nk->peer);
 	enum netkit_action policy;
 	struct nlattr *attr;
-	int err;
+	int err, i;
+	static const struct {
+		u32 attr;
+		char *name;
+	} fixed_params[] = {
+		{ IFLA_NETKIT_MODE,       "operating mode" },
+		{ IFLA_NETKIT_SCRUB,      "scrubbing" },
+		{ IFLA_NETKIT_PEER_SCRUB, "peer scrubbing" },
+		{ IFLA_NETKIT_PEER_INFO,  "peer info" },
+		{ IFLA_NETKIT_HEADROOM,   "headroom" },
+		{ IFLA_NETKIT_TAILROOM,   "tailroom" },
+	};
 
 	if (!nk->primary) {
 		NL_SET_ERR_MSG(extack,
@@ -858,28 +884,14 @@ static int netkit_change_link(struct net_device *dev, struct nlattr *tb[],
 		return -EACCES;
 	}
 
-	if (data[IFLA_NETKIT_MODE]) {
-		NL_SET_ERR_MSG_ATTR(extack, data[IFLA_NETKIT_MODE],
-				    "netkit link operating mode cannot be changed after device creation");
-		return -EACCES;
-	}
-
-	if (data[IFLA_NETKIT_SCRUB]) {
-		NL_SET_ERR_MSG_ATTR(extack, data[IFLA_NETKIT_SCRUB],
-				    "netkit scrubbing cannot be changed after device creation");
-		return -EACCES;
-	}
-
-	if (data[IFLA_NETKIT_PEER_SCRUB]) {
-		NL_SET_ERR_MSG_ATTR(extack, data[IFLA_NETKIT_PEER_SCRUB],
-				    "netkit scrubbing cannot be changed after device creation");
-		return -EACCES;
-	}
-
-	if (data[IFLA_NETKIT_PEER_INFO]) {
-		NL_SET_ERR_MSG_ATTR(extack, data[IFLA_NETKIT_PEER_INFO],
-				    "netkit peer info cannot be changed after device creation");
-		return -EINVAL;
+	for (i = 0; i < ARRAY_SIZE(fixed_params); i++) {
+		attr = data[fixed_params[i].attr];
+		if (attr) {
+			NL_SET_ERR_MSG_ATTR_FMT(extack, attr,
+						"netkit link %s cannot be changed after device creation",
+						fixed_params[i].name);
+			return -EACCES;
+		}
 	}
 
 	if (data[IFLA_NETKIT_POLICY]) {
@@ -914,6 +926,8 @@ static size_t netkit_get_size(const struct net_device *dev)
 	       nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_PEER_SCRUB */
 	       nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_MODE */
 	       nla_total_size(sizeof(u8))  + /* IFLA_NETKIT_PRIMARY */
+	       nla_total_size(sizeof(u16)) + /* IFLA_NETKIT_HEADROOM */
+	       nla_total_size(sizeof(u16)) + /* IFLA_NETKIT_TAILROOM */
 	       0;
 }
 
@@ -930,6 +944,10 @@ static int netkit_fill_info(struct sk_buff *skb, const struct net_device *dev)
 		return -EMSGSIZE;
 	if (nla_put_u32(skb, IFLA_NETKIT_SCRUB, nk->scrub))
 		return -EMSGSIZE;
+	if (nla_put_u16(skb, IFLA_NETKIT_HEADROOM, dev->needed_headroom))
+		return -EMSGSIZE;
+	if (nla_put_u16(skb, IFLA_NETKIT_TAILROOM, dev->needed_tailroom))
+		return -EMSGSIZE;
 
 	if (peer) {
 		nk = netkit_priv(peer);
@@ -947,6 +965,8 @@ static const struct nla_policy netkit_policy[IFLA_NETKIT_MAX + 1] = {
 	[IFLA_NETKIT_MODE]		= NLA_POLICY_MAX(NLA_U32, NETKIT_L3),
 	[IFLA_NETKIT_POLICY]		= { .type = NLA_U32 },
 	[IFLA_NETKIT_PEER_POLICY]	= { .type = NLA_U32 },
+	[IFLA_NETKIT_HEADROOM]		= { .type = NLA_U16 },
+	[IFLA_NETKIT_TAILROOM]		= { .type = NLA_U16 },
 	[IFLA_NETKIT_SCRUB]		= NLA_POLICY_MAX(NLA_U32, NETKIT_SCRUB_DEFAULT),
 	[IFLA_NETKIT_PEER_SCRUB]	= NLA_POLICY_MAX(NLA_U32, NETKIT_SCRUB_DEFAULT),
 	[IFLA_NETKIT_PRIMARY]		= { .type = NLA_REJECT,
diff --git a/drivers/net/pcs/pcs-lynx.c b/drivers/net/pcs/pcs-lynx.c
index 767a8c0714ac..6457190ec6e7 100644
--- a/drivers/net/pcs/pcs-lynx.c
+++ b/drivers/net/pcs/pcs-lynx.c
@@ -334,9 +334,19 @@ static const struct phylink_pcs_ops lynx_pcs_phylink_ops = {
 	.pcs_link_up = lynx_pcs_link_up,
 };
 
+static const phy_interface_t lynx_interfaces[] = {
+	PHY_INTERFACE_MODE_SGMII,
+	PHY_INTERFACE_MODE_QSGMII,
+	PHY_INTERFACE_MODE_1000BASEX,
+	PHY_INTERFACE_MODE_2500BASEX,
+	PHY_INTERFACE_MODE_10GBASER,
+	PHY_INTERFACE_MODE_USXGMII,
+};
+
 static struct phylink_pcs *lynx_pcs_create(struct mdio_device *mdio)
 {
 	struct lynx_pcs *lynx;
+	int i;
 
 	lynx = kzalloc(sizeof(*lynx), GFP_KERNEL);
 	if (!lynx)
@@ -348,6 +358,9 @@ static struct phylink_pcs *lynx_pcs_create(struct mdio_device *mdio)
 	lynx->pcs.neg_mode = true;
 	lynx->pcs.poll = true;
 
+	for (i = 0; i < ARRAY_SIZE(lynx_interfaces); i++)
+		__set_bit(lynx_interfaces[i], lynx->pcs.supported_interfaces);
+
 	return lynx_to_phylink_pcs(lynx);
 }
 
diff --git a/drivers/net/pcs/pcs-mtk-lynxi.c b/drivers/net/pcs/pcs-mtk-lynxi.c
index ed91cd7a406a..4fe0fb6d12a4 100644
--- a/drivers/net/pcs/pcs-mtk-lynxi.c
+++ b/drivers/net/pcs/pcs-mtk-lynxi.c
@@ -307,6 +307,10 @@ struct phylink_pcs *mtk_pcs_lynxi_create(struct device *dev,
 	mpcs->pcs.poll = true;
 	mpcs->interface = PHY_INTERFACE_MODE_NA;
 
+	__set_bit(PHY_INTERFACE_MODE_SGMII, mpcs->pcs.supported_interfaces);
+	__set_bit(PHY_INTERFACE_MODE_1000BASEX, mpcs->pcs.supported_interfaces);
+	__set_bit(PHY_INTERFACE_MODE_2500BASEX, mpcs->pcs.supported_interfaces);
+
 	return &mpcs->pcs;
 }
 EXPORT_SYMBOL(mtk_pcs_lynxi_create);
diff --git a/drivers/net/pcs/pcs-xpcs.c b/drivers/net/pcs/pcs-xpcs.c
index f70ca39f0905..c06b66f40022 100644
--- a/drivers/net/pcs/pcs-xpcs.c
+++ b/drivers/net/pcs/pcs-xpcs.c
@@ -594,14 +594,13 @@ static unsigned int xpcs_inband_caps(struct phylink_pcs *pcs,
 	}
 }
 
-void xpcs_get_interfaces(struct dw_xpcs *xpcs, unsigned long *interfaces)
+static void xpcs_get_interfaces(struct dw_xpcs *xpcs, unsigned long *interfaces)
 {
 	const struct dw_xpcs_compat *compat;
 
 	for (compat = xpcs->desc->compat; compat->supported; compat++)
 		__set_bit(compat->interface, interfaces);
 }
-EXPORT_SYMBOL_GPL(xpcs_get_interfaces);
 
 int xpcs_config_eee(struct dw_xpcs *xpcs, int mult_fact_100ns, int enable)
 {
@@ -1446,6 +1445,8 @@ static struct dw_xpcs *xpcs_create(struct mdio_device *mdiodev)
 	if (ret)
 		goto out_clear_clks;
 
+	xpcs_get_interfaces(xpcs, xpcs->pcs.supported_interfaces);
+
 	if (xpcs->info.pma == WX_TXGBE_XPCS_PMA_10G_ID)
 		xpcs->pcs.poll = false;
 	else
diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig
index dc625f2b3ae4..9ad3dbfd2f99 100644
--- a/drivers/net/phy/Kconfig
+++ b/drivers/net/phy/Kconfig
@@ -287,8 +287,8 @@ config MICROCHIP_PHY
 
 config MICROCHIP_T1_PHY
 	tristate "Microchip T1 PHYs"
-	select MICROCHIP_PHY_RDS_PTP if NETWORK_PHY_TIMESTAMPING && \
-				  PTP_1588_CLOCK_OPTIONAL
+	select MICROCHIP_PHY_RDS_PTP if NETWORK_PHY_TIMESTAMPING
+	depends on PTP_1588_CLOCK_OPTIONAL
 	help
 	  Supports the LAN8XXX PHYs.
 
diff --git a/drivers/net/phy/dp83822.c b/drivers/net/phy/dp83822.c
index 334c17a68edd..4262bc31503b 100644
--- a/drivers/net/phy/dp83822.c
+++ b/drivers/net/phy/dp83822.c
@@ -30,6 +30,9 @@
 #define MII_DP83822_FCSCR	0x14
 #define MII_DP83822_RCSR	0x17
 #define MII_DP83822_RESET_CTRL	0x1f
+#define MII_DP83822_MLEDCR	0x25
+#define MII_DP83822_LEDCFG1	0x460
+#define MII_DP83822_IOCTRL1	0x462
 #define MII_DP83822_IOCTRL2	0x463
 #define MII_DP83822_GENCFG	0x465
 #define MII_DP83822_SOR1	0x467
@@ -105,10 +108,26 @@
 #define DP83822_RX_CLK_SHIFT	BIT(12)
 #define DP83822_TX_CLK_SHIFT	BIT(11)
 
+/* MLEDCR bits */
+#define DP83822_MLEDCR_CFG		GENMASK(6, 3)
+#define DP83822_MLEDCR_ROUTE		GENMASK(1, 0)
+#define DP83822_MLEDCR_ROUTE_LED_0	DP83822_MLEDCR_ROUTE
+
+/* LEDCFG1 bits */
+#define DP83822_LEDCFG1_LED1_CTRL	GENMASK(11, 8)
+#define DP83822_LEDCFG1_LED3_CTRL	GENMASK(7, 4)
+
+/* IOCTRL1 bits */
+#define DP83822_IOCTRL1_GPIO3_CTRL		GENMASK(10, 8)
+#define DP83822_IOCTRL1_GPIO3_CTRL_LED3		BIT(0)
+#define DP83822_IOCTRL1_GPIO1_CTRL		GENMASK(2, 0)
+#define DP83822_IOCTRL1_GPIO1_CTRL_LED_1	BIT(0)
+
 /* IOCTRL2 bits */
 #define DP83822_IOCTRL2_GPIO2_CLK_SRC		GENMASK(6, 4)
 #define DP83822_IOCTRL2_GPIO2_CTRL		GENMASK(2, 0)
 #define DP83822_IOCTRL2_GPIO2_CTRL_CLK_REF	GENMASK(1, 0)
+#define DP83822_IOCTRL2_GPIO2_CTRL_MLED		BIT(0)
 
 #define DP83822_CLK_SRC_MAC_IF			0x0
 #define DP83822_CLK_SRC_XI			0x1
@@ -117,6 +136,22 @@
 #define DP83822_CLK_SRC_FREE_RUNNING		0x6
 #define DP83822_CLK_SRC_RECOVERED		0x7
 
+#define DP83822_LED_FN_LINK		0x0 /* Link established */
+#define DP83822_LED_FN_RX_TX		0x1 /* Receive or Transmit activity */
+#define DP83822_LED_FN_TX		0x2 /* Transmit activity */
+#define DP83822_LED_FN_RX		0x3 /* Receive activity */
+#define DP83822_LED_FN_COLLISION	0x4 /* Collision detected */
+#define DP83822_LED_FN_LINK_100_BTX	0x5 /* 100 BTX link established */
+#define DP83822_LED_FN_LINK_10_BT	0x6 /* 10BT link established */
+#define DP83822_LED_FN_FULL_DUPLEX	0x7 /* Full duplex */
+#define DP83822_LED_FN_LINK_RX_TX	0x8 /* Link established, blink for rx or tx activity */
+#define DP83822_LED_FN_ACTIVE_STRETCH	0x9 /* Active Stretch Signal */
+#define DP83822_LED_FN_MII_LINK		0xa /* MII LINK (100BT+FD) */
+#define DP83822_LED_FN_LPI_MODE		0xb /* LPI Mode (EEE) */
+#define DP83822_LED_FN_RX_TX_ERR	0xc /* TX/RX MII Error */
+#define DP83822_LED_FN_LINK_LOST	0xd /* Link Lost */
+#define DP83822_LED_FN_PRBS_ERR		0xe /* Blink for PRBS error */
+
 /* SOR1 mode */
 #define DP83822_STRAP_MODE1	0
 #define DP83822_STRAP_MODE2	BIT(0)
@@ -145,6 +180,13 @@
 					ADVERTISED_FIBRE | \
 					ADVERTISED_Pause | ADVERTISED_Asym_Pause)
 
+#define DP83822_MAX_LED_PINS		4
+
+#define DP83822_LED_INDEX_LED_0		0
+#define DP83822_LED_INDEX_LED_1_GPIO1	1
+#define DP83822_LED_INDEX_COL_GPIO2	2
+#define DP83822_LED_INDEX_RX_D3_GPIO3	3
+
 struct dp83822_private {
 	bool fx_signal_det_low;
 	int fx_enabled;
@@ -154,6 +196,7 @@ struct dp83822_private {
 	struct ethtool_wolinfo wol;
 	bool set_gpio2_clk_out;
 	u32 gpio2_clk_out;
+	bool led_pin_enable[DP83822_MAX_LED_PINS];
 };
 
 static int dp83822_config_wol(struct phy_device *phydev,
@@ -418,6 +461,48 @@ static int dp83822_read_status(struct phy_device *phydev)
 	return 0;
 }
 
+static int dp83822_config_init_leds(struct phy_device *phydev)
+{
+	struct dp83822_private *dp83822 = phydev->priv;
+	int ret;
+
+	if (dp83822->led_pin_enable[DP83822_LED_INDEX_LED_0]) {
+		ret = phy_modify_mmd(phydev, MDIO_MMD_VEND2, MII_DP83822_MLEDCR,
+				     DP83822_MLEDCR_ROUTE,
+				     FIELD_PREP(DP83822_MLEDCR_ROUTE,
+						DP83822_MLEDCR_ROUTE_LED_0));
+		if (ret)
+			return ret;
+	} else if (dp83822->led_pin_enable[DP83822_LED_INDEX_COL_GPIO2]) {
+		ret = phy_modify_mmd(phydev, MDIO_MMD_VEND2, MII_DP83822_IOCTRL2,
+				     DP83822_IOCTRL2_GPIO2_CTRL,
+				     FIELD_PREP(DP83822_IOCTRL2_GPIO2_CTRL,
+						DP83822_IOCTRL2_GPIO2_CTRL_MLED));
+		if (ret)
+			return ret;
+	}
+
+	if (dp83822->led_pin_enable[DP83822_LED_INDEX_LED_1_GPIO1]) {
+		ret = phy_modify_mmd(phydev, MDIO_MMD_VEND2, MII_DP83822_IOCTRL1,
+				     DP83822_IOCTRL1_GPIO1_CTRL,
+				     FIELD_PREP(DP83822_IOCTRL1_GPIO1_CTRL,
+						DP83822_IOCTRL1_GPIO1_CTRL_LED_1));
+		if (ret)
+			return ret;
+	}
+
+	if (dp83822->led_pin_enable[DP83822_LED_INDEX_RX_D3_GPIO3]) {
+		ret = phy_modify_mmd(phydev, MDIO_MMD_VEND2, MII_DP83822_IOCTRL1,
+				     DP83822_IOCTRL1_GPIO3_CTRL,
+				     FIELD_PREP(DP83822_IOCTRL1_GPIO3_CTRL,
+						DP83822_IOCTRL1_GPIO3_CTRL_LED3));
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
 static int dp83822_config_init(struct phy_device *phydev)
 {
 	struct dp83822_private *dp83822 = phydev->priv;
@@ -437,6 +522,10 @@ static int dp83822_config_init(struct phy_device *phydev)
 			       FIELD_PREP(DP83822_IOCTRL2_GPIO2_CLK_SRC,
 					  dp83822->gpio2_clk_out));
 
+	err = dp83822_config_init_leds(phydev);
+	if (err)
+		return err;
+
 	if (phy_interface_is_rgmii(phydev)) {
 		rx_int_delay = phy_get_internal_delay(phydev, dev, NULL, 0,
 						      true);
@@ -631,6 +720,61 @@ static int dp83822_phy_reset(struct phy_device *phydev)
 }
 
 #ifdef CONFIG_OF_MDIO
+static int dp83822_of_init_leds(struct phy_device *phydev)
+{
+	struct device_node *node = phydev->mdio.dev.of_node;
+	struct dp83822_private *dp83822 = phydev->priv;
+	struct device_node *leds;
+	u32 index;
+	int err;
+
+	if (!node)
+		return 0;
+
+	leds = of_get_child_by_name(node, "leds");
+	if (!leds)
+		return 0;
+
+	for_each_available_child_of_node_scoped(leds, led) {
+		err = of_property_read_u32(led, "reg", &index);
+		if (err) {
+			of_node_put(leds);
+			return err;
+		}
+
+		if (index <= DP83822_LED_INDEX_RX_D3_GPIO3) {
+			dp83822->led_pin_enable[index] = true;
+		} else {
+			of_node_put(leds);
+			return -EINVAL;
+		}
+	}
+
+	of_node_put(leds);
+	/* LED_0 and COL(GPIO2) use the MLED function. MLED can be routed to
+	 * only one of these two pins at a time.
+	 */
+	if (dp83822->led_pin_enable[DP83822_LED_INDEX_LED_0] &&
+	    dp83822->led_pin_enable[DP83822_LED_INDEX_COL_GPIO2]) {
+		phydev_err(phydev, "LED_0 and COL(GPIO2) cannot be used as LED output at the same time\n");
+		return -EINVAL;
+	}
+
+	if (dp83822->led_pin_enable[DP83822_LED_INDEX_COL_GPIO2] &&
+	    dp83822->set_gpio2_clk_out) {
+		phydev_err(phydev, "COL(GPIO2) cannot be used as LED outout, already used as clock output\n");
+		return -EINVAL;
+	}
+
+	if (dp83822->led_pin_enable[DP83822_LED_INDEX_RX_D3_GPIO3] &&
+	    phydev->interface != PHY_INTERFACE_MODE_RMII) {
+		phydev_err(phydev, "RX_D3 can only be used as LED output when in RMII mode\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 static int dp83822_of_init(struct phy_device *phydev)
 {
 	struct dp83822_private *dp83822 = phydev->priv;
@@ -671,7 +815,7 @@ static int dp83822_of_init(struct phy_device *phydev)
 		dp83822->set_gpio2_clk_out = true;
 	}
 
-	return 0;
+	return dp83822_of_init_leds(phydev);
 }
 
 static int dp83826_to_dac_minus_one_regval(int percent)
@@ -769,7 +913,9 @@ static int dp83822_probe(struct phy_device *phydev)
 	if (ret)
 		return ret;
 
-	dp83822_of_init(phydev);
+	ret = dp83822_of_init(phydev);
+	if (ret)
+		return ret;
 
 	if (dp83822->fx_enabled)
 		phydev->port = PORT_FIBRE;
@@ -816,6 +962,130 @@ static int dp83822_resume(struct phy_device *phydev)
 	return 0;
 }
 
+static int dp83822_led_mode(u8 index, unsigned long rules)
+{
+	switch (rules) {
+	case BIT(TRIGGER_NETDEV_LINK):
+		return DP83822_LED_FN_LINK;
+	case BIT(TRIGGER_NETDEV_LINK_10):
+		return DP83822_LED_FN_LINK_10_BT;
+	case BIT(TRIGGER_NETDEV_LINK_100):
+		return DP83822_LED_FN_LINK_100_BTX;
+	case BIT(TRIGGER_NETDEV_FULL_DUPLEX):
+		return DP83822_LED_FN_FULL_DUPLEX;
+	case BIT(TRIGGER_NETDEV_TX):
+		return DP83822_LED_FN_TX;
+	case BIT(TRIGGER_NETDEV_RX):
+		return DP83822_LED_FN_RX;
+	case BIT(TRIGGER_NETDEV_TX) | BIT(TRIGGER_NETDEV_RX):
+		return DP83822_LED_FN_RX_TX;
+	case BIT(TRIGGER_NETDEV_TX_ERR) | BIT(TRIGGER_NETDEV_RX_ERR):
+		return DP83822_LED_FN_RX_TX_ERR;
+	case BIT(TRIGGER_NETDEV_LINK) | BIT(TRIGGER_NETDEV_TX) | BIT(TRIGGER_NETDEV_RX):
+		return DP83822_LED_FN_LINK_RX_TX;
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+static int dp83822_led_hw_is_supported(struct phy_device *phydev, u8 index,
+				       unsigned long rules)
+{
+	int mode;
+
+	mode = dp83822_led_mode(index, rules);
+	if (mode < 0)
+		return mode;
+
+	return 0;
+}
+
+static int dp83822_led_hw_control_set(struct phy_device *phydev, u8 index,
+				      unsigned long rules)
+{
+	int mode;
+
+	mode = dp83822_led_mode(index, rules);
+	if (mode < 0)
+		return mode;
+
+	if (index == DP83822_LED_INDEX_LED_0 || index == DP83822_LED_INDEX_COL_GPIO2)
+		return phy_modify_mmd(phydev, MDIO_MMD_VEND2,
+				      MII_DP83822_MLEDCR, DP83822_MLEDCR_CFG,
+				      FIELD_PREP(DP83822_MLEDCR_CFG, mode));
+	else if (index == DP83822_LED_INDEX_LED_1_GPIO1)
+		return phy_modify_mmd(phydev, MDIO_MMD_VEND2,
+				      MII_DP83822_LEDCFG1,
+				      DP83822_LEDCFG1_LED1_CTRL,
+				      FIELD_PREP(DP83822_LEDCFG1_LED1_CTRL,
+						 mode));
+	else
+		return phy_modify_mmd(phydev, MDIO_MMD_VEND2,
+				      MII_DP83822_LEDCFG1,
+				      DP83822_LEDCFG1_LED3_CTRL,
+				      FIELD_PREP(DP83822_LEDCFG1_LED3_CTRL,
+						 mode));
+}
+
+static int dp83822_led_hw_control_get(struct phy_device *phydev, u8 index,
+				      unsigned long *rules)
+{
+	int val;
+
+	if (index == DP83822_LED_INDEX_LED_0 || index == DP83822_LED_INDEX_COL_GPIO2) {
+		val = phy_read_mmd(phydev, MDIO_MMD_VEND2, MII_DP83822_MLEDCR);
+		if (val < 0)
+			return val;
+
+		val = FIELD_GET(DP83822_MLEDCR_CFG, val);
+	} else {
+		val = phy_read_mmd(phydev, MDIO_MMD_VEND2, MII_DP83822_LEDCFG1);
+		if (val < 0)
+			return val;
+
+		if (index == DP83822_LED_INDEX_LED_1_GPIO1)
+			val = FIELD_GET(DP83822_LEDCFG1_LED1_CTRL, val);
+		else
+			val = FIELD_GET(DP83822_LEDCFG1_LED3_CTRL, val);
+	}
+
+	switch (val) {
+	case DP83822_LED_FN_LINK:
+		*rules = BIT(TRIGGER_NETDEV_LINK);
+		break;
+	case DP83822_LED_FN_LINK_10_BT:
+		*rules = BIT(TRIGGER_NETDEV_LINK_10);
+		break;
+	case DP83822_LED_FN_LINK_100_BTX:
+		*rules = BIT(TRIGGER_NETDEV_LINK_100);
+		break;
+	case DP83822_LED_FN_FULL_DUPLEX:
+		*rules = BIT(TRIGGER_NETDEV_FULL_DUPLEX);
+		break;
+	case DP83822_LED_FN_TX:
+		*rules = BIT(TRIGGER_NETDEV_TX);
+		break;
+	case DP83822_LED_FN_RX:
+		*rules = BIT(TRIGGER_NETDEV_RX);
+		break;
+	case DP83822_LED_FN_RX_TX:
+		*rules = BIT(TRIGGER_NETDEV_TX) | BIT(TRIGGER_NETDEV_RX);
+		break;
+	case DP83822_LED_FN_RX_TX_ERR:
+		*rules = BIT(TRIGGER_NETDEV_TX_ERR) | BIT(TRIGGER_NETDEV_RX_ERR);
+		break;
+	case DP83822_LED_FN_LINK_RX_TX:
+		*rules = BIT(TRIGGER_NETDEV_LINK) | BIT(TRIGGER_NETDEV_TX) |
+			 BIT(TRIGGER_NETDEV_RX);
+		break;
+	default:
+		*rules = 0;
+		break;
+	}
+
+	return 0;
+}
+
 #define DP83822_PHY_DRIVER(_id, _name)				\
 	{							\
 		PHY_ID_MATCH_MODEL(_id),			\
@@ -831,6 +1101,9 @@ static int dp83822_resume(struct phy_device *phydev)
 		.handle_interrupt = dp83822_handle_interrupt,	\
 		.suspend = dp83822_suspend,			\
 		.resume = dp83822_resume,			\
+		.led_hw_is_supported = dp83822_led_hw_is_supported,	\
+		.led_hw_control_set = dp83822_led_hw_control_set,	\
+		.led_hw_control_get = dp83822_led_hw_control_get,	\
 	}
 
 #define DP83825_PHY_DRIVER(_id, _name)				\
diff --git a/drivers/net/phy/dp83td510.c b/drivers/net/phy/dp83td510.c
index 92aa3a2b9744..1abeacca7441 100644
--- a/drivers/net/phy/dp83td510.c
+++ b/drivers/net/phy/dp83td510.c
@@ -34,6 +34,29 @@
 #define DP83TD510E_CTRL_HW_RESET		BIT(15)
 #define DP83TD510E_CTRL_SW_RESET		BIT(14)
 
+/*
+ * DP83TD510E_PKT_STAT_x registers correspond to similarly named registers
+ * in the datasheet (PKT_STAT_1 through PKT_STAT_6). These registers store
+ * 32-bit or 16-bit counters for TX and RX statistics and must be read in
+ * sequence to ensure the counters are cleared correctly.
+ *
+ * - DP83TD510E_PKT_STAT_1: Contains TX packet count bits [15:0].
+ * - DP83TD510E_PKT_STAT_2: Contains TX packet count bits [31:16].
+ * - DP83TD510E_PKT_STAT_3: Contains TX error packet count.
+ * - DP83TD510E_PKT_STAT_4: Contains RX packet count bits [15:0].
+ * - DP83TD510E_PKT_STAT_5: Contains RX packet count bits [31:16].
+ * - DP83TD510E_PKT_STAT_6: Contains RX error packet count.
+ *
+ * Keeping the register names as defined in the datasheet helps maintain
+ * clarity and alignment with the documentation.
+ */
+#define DP83TD510E_PKT_STAT_1			0x12b
+#define DP83TD510E_PKT_STAT_2			0x12c
+#define DP83TD510E_PKT_STAT_3			0x12d
+#define DP83TD510E_PKT_STAT_4			0x12e
+#define DP83TD510E_PKT_STAT_5			0x12f
+#define DP83TD510E_PKT_STAT_6			0x130
+
 #define DP83TD510E_AN_STAT_1			0x60c
 #define DP83TD510E_MASTER_SLAVE_RESOL_FAIL	BIT(15)
 
@@ -58,8 +81,16 @@ static const u16 dp83td510_mse_sqi_map[] = {
 	0x0000  /* 24dB =< SNR */
 };
 
+struct dp83td510_stats {
+	u64 tx_pkt_cnt;
+	u64 tx_err_pkt_cnt;
+	u64 rx_pkt_cnt;
+	u64 rx_err_pkt_cnt;
+};
+
 struct dp83td510_priv {
 	bool alcd_test_active;
+	struct dp83td510_stats stats;
 };
 
 /* Time Domain Reflectometry (TDR) Functionality of DP83TD510 PHY
@@ -177,6 +208,85 @@ struct dp83td510_priv {
 #define DP83TD510E_ALCD_COMPLETE			BIT(15)
 #define DP83TD510E_ALCD_CABLE_LENGTH			GENMASK(10, 0)
 
+/**
+ * dp83td510_update_stats - Update the PHY statistics for the DP83TD510 PHY.
+ * @phydev: Pointer to the phy_device structure.
+ *
+ * The function reads the PHY statistics registers and updates the statistics
+ * structure.
+ *
+ * Returns: 0 on success or a negative error code on failure.
+ */
+static int dp83td510_update_stats(struct phy_device *phydev)
+{
+	struct dp83td510_priv *priv = phydev->priv;
+	u32 count;
+	int ret;
+
+	/* The DP83TD510E_PKT_STAT registers are divided into two groups:
+	 * - Group 1 (TX stats): DP83TD510E_PKT_STAT_1 to DP83TD510E_PKT_STAT_3
+	 * - Group 2 (RX stats): DP83TD510E_PKT_STAT_4 to DP83TD510E_PKT_STAT_6
+	 *
+	 * Registers in each group are cleared only after reading them in a
+	 * plain sequence (e.g., 1, 2, 3 for Group 1 or 4, 5, 6 for Group 2).
+	 * Any deviation from the sequence, such as reading 1, 2, 1, 2, 3, will
+	 * prevent the group from being cleared. Additionally, the counters
+	 * for a group are frozen as soon as the first register in that group
+	 * is accessed.
+	 */
+	ret = phy_read_mmd(phydev, MDIO_MMD_VEND2, DP83TD510E_PKT_STAT_1);
+	if (ret < 0)
+		return ret;
+	/* tx_pkt_cnt_15_0 */
+	count = ret;
+
+	ret = phy_read_mmd(phydev, MDIO_MMD_VEND2, DP83TD510E_PKT_STAT_2);
+	if (ret < 0)
+		return ret;
+	/* tx_pkt_cnt_31_16 */
+	count |= ret << 16;
+	priv->stats.tx_pkt_cnt += count;
+
+	ret = phy_read_mmd(phydev, MDIO_MMD_VEND2, DP83TD510E_PKT_STAT_3);
+	if (ret < 0)
+		return ret;
+	/* tx_err_pkt_cnt */
+	priv->stats.tx_err_pkt_cnt += ret;
+
+	ret = phy_read_mmd(phydev, MDIO_MMD_VEND2, DP83TD510E_PKT_STAT_4);
+	if (ret < 0)
+		return ret;
+	/* rx_pkt_cnt_15_0 */
+	count = ret;
+
+	ret = phy_read_mmd(phydev, MDIO_MMD_VEND2, DP83TD510E_PKT_STAT_5);
+	if (ret < 0)
+		return ret;
+	/* rx_pkt_cnt_31_16 */
+	count |= ret << 16;
+	priv->stats.rx_pkt_cnt += count;
+
+	ret = phy_read_mmd(phydev, MDIO_MMD_VEND2, DP83TD510E_PKT_STAT_6);
+	if (ret < 0)
+		return ret;
+	/* rx_err_pkt_cnt */
+	priv->stats.rx_err_pkt_cnt += ret;
+
+	return 0;
+}
+
+static void dp83td510_get_phy_stats(struct phy_device *phydev,
+				    struct ethtool_eth_phy_stats *eth_stats,
+				    struct ethtool_phy_stats *stats)
+{
+	struct dp83td510_priv *priv = phydev->priv;
+
+	stats->tx_packets = priv->stats.tx_pkt_cnt;
+	stats->tx_errors = priv->stats.tx_err_pkt_cnt;
+	stats->rx_packets = priv->stats.rx_pkt_cnt;
+	stats->rx_errors = priv->stats.rx_err_pkt_cnt;
+}
+
 static int dp83td510_config_intr(struct phy_device *phydev)
 {
 	int ret;
@@ -599,6 +709,8 @@ static struct phy_driver dp83td510_driver[] = {
 	.get_sqi_max	= dp83td510_get_sqi_max,
 	.cable_test_start = dp83td510_cable_test_start,
 	.cable_test_get_status = dp83td510_cable_test_get_status,
+	.get_phy_stats	= dp83td510_get_phy_stats,
+	.update_stats	= dp83td510_update_stats,
 
 	.suspend	= genphy_suspend,
 	.resume		= genphy_resume,
diff --git a/drivers/net/phy/dp83tg720.c b/drivers/net/phy/dp83tg720.c
index 0ef4d7dba065..4ea752131b8c 100644
--- a/drivers/net/phy/dp83tg720.c
+++ b/drivers/net/phy/dp83tg720.c
@@ -51,6 +51,9 @@
 /* Register 0x0405: Unknown Register */
 #define DP83TG720S_UNKNOWN_0405			0x405
 
+#define DP83TG720S_LINK_QUAL_3			0x547
+#define DP83TG720S_LINK_LOSS_CNT_MASK		GENMASK(15, 10)
+
 /* Register 0x0576: TDR Master Link Down Control */
 #define DP83TG720S_TDR_MASTER_LINK_DOWN		0x576
 
@@ -60,6 +63,29 @@
 /* In RGMII mode, Enable or disable the internal delay for TXD */
 #define DP83TG720S_RGMII_TX_CLK_SEL		BIT(0)
 
+/*
+ * DP83TG720S_PKT_STAT_x registers correspond to similarly named registers
+ * in the datasheet (PKT_STAT_1 through PKT_STAT_6). These registers store
+ * 32-bit or 16-bit counters for TX and RX statistics and must be read in
+ * sequence to ensure the counters are cleared correctly.
+ *
+ * - DP83TG720S_PKT_STAT_1: Contains TX packet count bits [15:0].
+ * - DP83TG720S_PKT_STAT_2: Contains TX packet count bits [31:16].
+ * - DP83TG720S_PKT_STAT_3: Contains TX error packet count.
+ * - DP83TG720S_PKT_STAT_4: Contains RX packet count bits [15:0].
+ * - DP83TG720S_PKT_STAT_5: Contains RX packet count bits [31:16].
+ * - DP83TG720S_PKT_STAT_6: Contains RX error packet count.
+ *
+ * Keeping the register names as defined in the datasheet helps maintain
+ * clarity and alignment with the documentation.
+ */
+#define DP83TG720S_PKT_STAT_1			0x639
+#define DP83TG720S_PKT_STAT_2			0x63a
+#define DP83TG720S_PKT_STAT_3			0x63b
+#define DP83TG720S_PKT_STAT_4			0x63c
+#define DP83TG720S_PKT_STAT_5			0x63d
+#define DP83TG720S_PKT_STAT_6			0x63e
+
 /* Register 0x083F: Unknown Register */
 #define DP83TG720S_UNKNOWN_083F			0x83f
 
@@ -69,6 +95,113 @@
 
 #define DP83TG720_SQI_MAX			7
 
+struct dp83tg720_stats {
+	u64 link_loss_cnt;
+	u64 tx_pkt_cnt;
+	u64 tx_err_pkt_cnt;
+	u64 rx_pkt_cnt;
+	u64 rx_err_pkt_cnt;
+};
+
+struct dp83tg720_priv {
+	struct dp83tg720_stats stats;
+};
+
+/**
+ * dp83tg720_update_stats - Update the PHY statistics for the DP83TD510 PHY.
+ * @phydev: Pointer to the phy_device structure.
+ *
+ * The function reads the PHY statistics registers and updates the statistics
+ * structure.
+ *
+ * Returns: 0 on success or a negative error code on failure.
+ */
+static int dp83tg720_update_stats(struct phy_device *phydev)
+{
+	struct dp83tg720_priv *priv = phydev->priv;
+	u32 count;
+	int ret;
+
+	/* Read the link loss count */
+	ret = phy_read_mmd(phydev, MDIO_MMD_VEND2, DP83TG720S_LINK_QUAL_3);
+	if (ret < 0)
+		return ret;
+	/* link_loss_cnt */
+	count = FIELD_GET(DP83TG720S_LINK_LOSS_CNT_MASK, ret);
+	priv->stats.link_loss_cnt += count;
+
+	/* The DP83TG720S_PKT_STAT registers are divided into two groups:
+	 * - Group 1 (TX stats): DP83TG720S_PKT_STAT_1 to DP83TG720S_PKT_STAT_3
+	 * - Group 2 (RX stats): DP83TG720S_PKT_STAT_4 to DP83TG720S_PKT_STAT_6
+	 *
+	 * Registers in each group are cleared only after reading them in a
+	 * plain sequence (e.g., 1, 2, 3 for Group 1 or 4, 5, 6 for Group 2).
+	 * Any deviation from the sequence, such as reading 1, 2, 1, 2, 3, will
+	 * prevent the group from being cleared. Additionally, the counters
+	 * for a group are frozen as soon as the first register in that group
+	 * is accessed.
+	 */
+	ret = phy_read_mmd(phydev, MDIO_MMD_VEND2, DP83TG720S_PKT_STAT_1);
+	if (ret < 0)
+		return ret;
+	/* tx_pkt_cnt_15_0 */
+	count = ret;
+
+	ret = phy_read_mmd(phydev, MDIO_MMD_VEND2, DP83TG720S_PKT_STAT_2);
+	if (ret < 0)
+		return ret;
+	/* tx_pkt_cnt_31_16 */
+	count |= ret << 16;
+	priv->stats.tx_pkt_cnt += count;
+
+	ret = phy_read_mmd(phydev, MDIO_MMD_VEND2, DP83TG720S_PKT_STAT_3);
+	if (ret < 0)
+		return ret;
+	/* tx_err_pkt_cnt */
+	priv->stats.tx_err_pkt_cnt += ret;
+
+	ret = phy_read_mmd(phydev, MDIO_MMD_VEND2, DP83TG720S_PKT_STAT_4);
+	if (ret < 0)
+		return ret;
+	/* rx_pkt_cnt_15_0 */
+	count = ret;
+
+	ret = phy_read_mmd(phydev, MDIO_MMD_VEND2, DP83TG720S_PKT_STAT_5);
+	if (ret < 0)
+		return ret;
+	/* rx_pkt_cnt_31_16 */
+	count |= ret << 16;
+	priv->stats.rx_pkt_cnt += count;
+
+	ret = phy_read_mmd(phydev, MDIO_MMD_VEND2, DP83TG720S_PKT_STAT_6);
+	if (ret < 0)
+		return ret;
+	/* rx_err_pkt_cnt */
+	priv->stats.rx_err_pkt_cnt += ret;
+
+	return 0;
+}
+
+static void dp83tg720_get_link_stats(struct phy_device *phydev,
+				     struct ethtool_link_ext_stats *link_stats)
+{
+	struct dp83tg720_priv *priv = phydev->priv;
+
+	link_stats->link_down_events = priv->stats.link_loss_cnt;
+}
+
+static void dp83tg720_get_phy_stats(struct phy_device *phydev,
+				    struct ethtool_eth_phy_stats *eth_stats,
+				    struct ethtool_phy_stats *stats)
+{
+	struct dp83tg720_priv *priv = phydev->priv;
+
+	stats->tx_packets = priv->stats.tx_pkt_cnt;
+	stats->tx_errors = priv->stats.tx_err_pkt_cnt;
+	stats->rx_packets = priv->stats.rx_pkt_cnt;
+	stats->rx_errors = priv->stats.rx_err_pkt_cnt;
+}
+
 /**
  * dp83tg720_cable_test_start - Start the cable test for the DP83TG720 PHY.
  * @phydev: Pointer to the phy_device structure.
@@ -182,6 +315,11 @@ static int dp83tg720_cable_test_get_status(struct phy_device *phydev,
 
 	ethnl_cable_test_result(phydev, ETHTOOL_A_CABLE_PAIR_A, stat);
 
+	/* save the current stats before resetting the PHY */
+	ret = dp83tg720_update_stats(phydev);
+	if (ret)
+		return ret;
+
 	return phy_init_hw(phydev);
 }
 
@@ -217,6 +355,11 @@ static int dp83tg720_read_status(struct phy_device *phydev)
 	phy_sts = phy_read(phydev, DP83TG720S_MII_REG_10);
 	phydev->link = !!(phy_sts & DP83TG720S_LINK_STATUS);
 	if (!phydev->link) {
+		/* save the current stats before resetting the PHY */
+		ret = dp83tg720_update_stats(phydev);
+		if (ret)
+			return ret;
+
 		/* According to the "DP83TC81x, DP83TG72x Software
 		 * Implementation Guide", the PHY needs to be reset after a
 		 * link loss or if no link is created after at least 100ms.
@@ -341,12 +484,27 @@ static int dp83tg720_config_init(struct phy_device *phydev)
 	return genphy_c45_pma_baset1_read_master_slave(phydev);
 }
 
+static int dp83tg720_probe(struct phy_device *phydev)
+{
+	struct device *dev = &phydev->mdio.dev;
+	struct dp83tg720_priv *priv;
+
+	priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL);
+	if (!priv)
+		return -ENOMEM;
+
+	phydev->priv = priv;
+
+	return 0;
+}
+
 static struct phy_driver dp83tg720_driver[] = {
 {
 	PHY_ID_MATCH_MODEL(DP83TG720S_PHY_ID),
 	.name		= "TI DP83TG720S",
 
 	.flags          = PHY_POLL_CABLE_TEST,
+	.probe		= dp83tg720_probe,
 	.config_aneg	= dp83tg720_config_aneg,
 	.read_status	= dp83tg720_read_status,
 	.get_features	= genphy_c45_pma_read_ext_abilities,
@@ -355,6 +513,9 @@ static struct phy_driver dp83tg720_driver[] = {
 	.get_sqi_max	= dp83tg720_get_sqi_max,
 	.cable_test_start = dp83tg720_cable_test_start,
 	.cable_test_get_status = dp83tg720_cable_test_get_status,
+	.get_link_stats	= dp83tg720_get_link_stats,
+	.get_phy_stats	= dp83tg720_get_phy_stats,
+	.update_stats	= dp83tg720_update_stats,
 
 	.suspend	= genphy_suspend,
 	.resume		= genphy_resume,
diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c
index eeb33eb181ac..0fd1cb676cd5 100644
--- a/drivers/net/phy/micrel.c
+++ b/drivers/net/phy/micrel.c
@@ -2006,7 +2006,7 @@ static int ksz9477_config_init(struct phy_device *phydev)
 	 * in this switch shall be regarded as broken.
 	 */
 	if (phydev->dev_flags & MICREL_NO_EEE)
-		linkmode_fill(phydev->eee_broken_modes);
+		phy_disable_eee(phydev);
 
 	return kszphy_config_init(phydev);
 }
diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index e4b04cdaa995..c008fe050245 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -616,6 +616,49 @@ int phy_ethtool_get_stats(struct phy_device *phydev,
 EXPORT_SYMBOL(phy_ethtool_get_stats);
 
 /**
+ * __phy_ethtool_get_phy_stats - Retrieve standardized PHY statistics
+ * @phydev: Pointer to the PHY device
+ * @phy_stats: Pointer to ethtool_eth_phy_stats structure
+ * @phydev_stats: Pointer to ethtool_phy_stats structure
+ *
+ * Fetches PHY statistics using a kernel-defined interface for consistent
+ * diagnostics. Unlike phy_ethtool_get_stats(), which allows custom stats,
+ * this function enforces a standardized format for better interoperability.
+ */
+void __phy_ethtool_get_phy_stats(struct phy_device *phydev,
+				 struct ethtool_eth_phy_stats *phy_stats,
+				 struct ethtool_phy_stats *phydev_stats)
+{
+	if (!phydev->drv || !phydev->drv->get_phy_stats)
+		return;
+
+	mutex_lock(&phydev->lock);
+	phydev->drv->get_phy_stats(phydev, phy_stats, phydev_stats);
+	mutex_unlock(&phydev->lock);
+}
+
+/**
+ * __phy_ethtool_get_link_ext_stats - Retrieve extended link statistics for a PHY
+ * @phydev: Pointer to the PHY device
+ * @link_stats: Pointer to the structure to store extended link statistics
+ *
+ * Populates the ethtool_link_ext_stats structure with link down event counts
+ * and additional driver-specific link statistics, if available.
+ */
+void __phy_ethtool_get_link_ext_stats(struct phy_device *phydev,
+				      struct ethtool_link_ext_stats *link_stats)
+{
+	link_stats->link_down_events = READ_ONCE(phydev->link_down_events);
+
+	if (!phydev->drv || !phydev->drv->get_link_stats)
+		return;
+
+	mutex_lock(&phydev->lock);
+	phydev->drv->get_link_stats(phydev, link_stats);
+	mutex_unlock(&phydev->lock);
+}
+
+/**
  * phy_ethtool_get_plca_cfg - Get PLCA RS configuration
  * @phydev: the phy_device struct
  * @plca_cfg: where to store the retrieved configuration
@@ -1399,6 +1442,23 @@ static int phy_enable_interrupts(struct phy_device *phydev)
 }
 
 /**
+ * phy_update_stats - Update PHY device statistics if supported.
+ * @phydev: Pointer to the PHY device structure.
+ *
+ * If the PHY driver provides an update_stats callback, this function
+ * invokes it to update the PHY statistics. If not, it returns 0.
+ *
+ * Return: 0 on success, or a negative error code if the callback fails.
+ */
+static int phy_update_stats(struct phy_device *phydev)
+{
+	if (!phydev->drv->update_stats)
+		return 0;
+
+	return phydev->drv->update_stats(phydev);
+}
+
+/**
  * phy_request_interrupt - request and enable interrupt for a PHY device
  * @phydev: target phy_device struct
  *
@@ -1467,6 +1527,9 @@ static enum phy_state_work _phy_state_machine(struct phy_device *phydev)
 	case PHY_RUNNING:
 		err = phy_check_link_status(phydev);
 		func = &phy_check_link_status;
+
+		if (!err)
+			err = phy_update_stats(phydev);
 		break;
 	case PHY_CABLETEST:
 		err = phydev->drv->cable_test_get_status(phydev, &finished);
@@ -1641,6 +1704,27 @@ void phy_mac_interrupt(struct phy_device *phydev)
 EXPORT_SYMBOL(phy_mac_interrupt);
 
 /**
+ * phy_eee_rx_clock_stop() - configure PHY receive clock in LPI
+ * @phydev: target phy_device struct
+ * @clk_stop_enable: flag to indicate whether the clock can be stopped
+ *
+ * Configure whether the PHY can disable its receive clock during LPI mode,
+ * See IEEE 802.3 sections 22.2.2.2, 35.2.2.10, and 45.2.3.1.4.
+ *
+ * Returns: 0 or negative error.
+ */
+int phy_eee_rx_clock_stop(struct phy_device *phydev, bool clk_stop_enable)
+{
+	/* Configure the PHY to stop receiving xMII
+	 * clock while it is signaling LPI.
+	 */
+	return phy_modify_mmd(phydev, MDIO_MMD_PCS, MDIO_CTRL1,
+			      MDIO_PCS_CTRL1_CLKSTOP_EN,
+			      clk_stop_enable ? MDIO_PCS_CTRL1_CLKSTOP_EN : 0);
+}
+EXPORT_SYMBOL_GPL(phy_eee_rx_clock_stop);
+
+/**
  * phy_init_eee - init and check the EEE feature
  * @phydev: target phy_device struct
  * @clk_stop_enable: PHY may stop the clock during LPI
@@ -1664,11 +1748,7 @@ int phy_init_eee(struct phy_device *phydev, bool clk_stop_enable)
 		return -EPROTONOSUPPORT;
 
 	if (clk_stop_enable)
-		/* Configure the PHY to stop receiving xMII
-		 * clock while it is signaling LPI.
-		 */
-		ret = phy_set_bits_mmd(phydev, MDIO_MMD_PCS, MDIO_CTRL1,
-				       MDIO_PCS_CTRL1_CLKSTOP_EN);
+		ret = phy_eee_rx_clock_stop(phydev, true);
 
 	return ret < 0 ? ret : 0;
 }
diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index bdc997f59779..5b34d39d1d52 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -3800,6 +3800,8 @@ static const struct ethtool_phy_ops phy_ethtool_phy_ops = {
 static const struct phylib_stubs __phylib_stubs = {
 	.hwtstamp_get = __phy_hwtstamp_get,
 	.hwtstamp_set = __phy_hwtstamp_set,
+	.get_phy_stats = __phy_ethtool_get_phy_stats,
+	.get_link_ext_stats = __phy_ethtool_get_link_ext_stats,
 };
 
 static void phylib_register_stubs(void)
diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c
index 6d50c2fdb190..31754d5fd659 100644
--- a/drivers/net/phy/phylink.c
+++ b/drivers/net/phy/phylink.c
@@ -691,6 +691,17 @@ static int phylink_validate_mac_and_pcs(struct phylink *pl,
 			return -EINVAL;
 		}
 
+		/* Ensure that this PCS supports the interface which the MAC
+		 * returned it for. It is an error for the MAC to return a PCS
+		 * that does not support the interface mode.
+		 */
+		if (!phy_interface_empty(pcs->supported_interfaces) &&
+		    !test_bit(state->interface, pcs->supported_interfaces)) {
+			phylink_err(pl, "MAC returned PCS which does not support %s\n",
+				    phy_modes(state->interface));
+			return -EINVAL;
+		}
+
 		/* Validate the link parameters with the PCS */
 		if (pcs->ops->pcs_validate) {
 			ret = pcs->ops->pcs_validate(pcs, supported, state);
diff --git a/drivers/nfc/st21nfca/dep.c b/drivers/nfc/st21nfca/dep.c
index 1ec651e31064..3425b68f0ddc 100644
--- a/drivers/nfc/st21nfca/dep.c
+++ b/drivers/nfc/st21nfca/dep.c
@@ -116,18 +116,16 @@ static void st21nfca_tx_work(struct work_struct *work)
 	struct nfc_dev *dev;
 	struct sk_buff *skb;
 
-	if (info) {
-		dev = info->hdev->ndev;
-		skb = info->dep_info.tx_pending;
+	dev = info->hdev->ndev;
+	skb = info->dep_info.tx_pending;
 
-		device_lock(&dev->dev);
+	device_lock(&dev->dev);
 
-		nfc_hci_send_cmd_async(info->hdev, ST21NFCA_RF_READER_F_GATE,
-				ST21NFCA_WR_XCHG_DATA, skb->data, skb->len,
-				info->async_cb, info);
-		device_unlock(&dev->dev);
-		kfree_skb(skb);
-	}
+	nfc_hci_send_cmd_async(info->hdev, ST21NFCA_RF_READER_F_GATE,
+			ST21NFCA_WR_XCHG_DATA, skb->data, skb->len,
+			info->async_cb, info);
+	device_unlock(&dev->dev);
+	kfree_skb(skb);
 }
 
 static void st21nfca_im_send_pdu(struct st21nfca_hci_info *info,
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 611b02c8a8b3..c4bb8dfe1a45 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -173,6 +173,11 @@ enum nvme_quirks {
 	 * MSI (but not MSI-X) interrupts are broken and never fire.
 	 */
 	NVME_QUIRK_BROKEN_MSI			= (1 << 21),
+
+	/*
+	 * Align dma pool segment size to 512 bytes
+	 */
+	NVME_QUIRK_DMAPOOL_ALIGN_512		= (1 << 22),
 };
 
 /*
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 1a5ba80f1811..e2634f437f33 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2834,15 +2834,20 @@ static int nvme_disable_prepare_reset(struct nvme_dev *dev, bool shutdown)
 
 static int nvme_setup_prp_pools(struct nvme_dev *dev)
 {
+	size_t small_align = 256;
+
 	dev->prp_page_pool = dma_pool_create("prp list page", dev->dev,
 						NVME_CTRL_PAGE_SIZE,
 						NVME_CTRL_PAGE_SIZE, 0);
 	if (!dev->prp_page_pool)
 		return -ENOMEM;
 
+	if (dev->ctrl.quirks & NVME_QUIRK_DMAPOOL_ALIGN_512)
+		small_align = 512;
+
 	/* Optimisation for I/Os between 4k and 128k */
 	dev->prp_small_pool = dma_pool_create("prp list 256", dev->dev,
-						256, 256, 0);
+						256, small_align, 0);
 	if (!dev->prp_small_pool) {
 		dma_pool_destroy(dev->prp_page_pool);
 		return -ENOMEM;
@@ -3607,7 +3612,7 @@ static const struct pci_device_id nvme_id_table[] = {
 	{ PCI_VDEVICE(REDHAT, 0x0010),	/* Qemu emulated controller */
 		.driver_data = NVME_QUIRK_BOGUS_NID, },
 	{ PCI_DEVICE(0x1217, 0x8760), /* O2 Micro 64GB Steam Deck */
-		.driver_data = NVME_QUIRK_QDEPTH_ONE },
+		.driver_data = NVME_QUIRK_DMAPOOL_ALIGN_512, },
 	{ PCI_DEVICE(0x126f, 0x2262),	/* Silicon Motion generic */
 		.driver_data = NVME_QUIRK_NO_DEEPEST_PS |
 				NVME_QUIRK_BOGUS_NID, },
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 28c76a3e1bd2..b127d41dbbfe 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -2024,14 +2024,6 @@ static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
 	return __nvme_tcp_alloc_io_queues(ctrl);
 }
 
-static void nvme_tcp_destroy_io_queues(struct nvme_ctrl *ctrl, bool remove)
-{
-	nvme_tcp_stop_io_queues(ctrl);
-	if (remove)
-		nvme_remove_io_tag_set(ctrl);
-	nvme_tcp_free_io_queues(ctrl);
-}
-
 static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new)
 {
 	int ret, nr_queues;
@@ -2176,9 +2168,11 @@ static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl,
 	nvme_sync_io_queues(ctrl);
 	nvme_tcp_stop_io_queues(ctrl);
 	nvme_cancel_tagset(ctrl);
-	if (remove)
+	if (remove) {
 		nvme_unquiesce_io_queues(ctrl);
-	nvme_tcp_destroy_io_queues(ctrl, remove);
+		nvme_remove_io_tag_set(ctrl);
+	}
+	nvme_tcp_free_io_queues(ctrl);
 }
 
 static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl,
@@ -2267,7 +2261,9 @@ destroy_io:
 		nvme_sync_io_queues(ctrl);
 		nvme_tcp_stop_io_queues(ctrl);
 		nvme_cancel_tagset(ctrl);
-		nvme_tcp_destroy_io_queues(ctrl, new);
+		if (new)
+			nvme_remove_io_tag_set(ctrl);
+		nvme_tcp_free_io_queues(ctrl);
 	}
 destroy_admin:
 	nvme_stop_keep_alive(ctrl);
diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index 2962794ce881..fa89b0549c36 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -139,7 +139,7 @@ static u16 nvmet_get_smart_log_all(struct nvmet_req *req,
 	unsigned long idx;
 
 	ctrl = req->sq->ctrl;
-	xa_for_each(&ctrl->subsys->namespaces, idx, ns) {
+	nvmet_for_each_enabled_ns(&ctrl->subsys->namespaces, idx, ns) {
 		/* we don't have the right data for file backed ns */
 		if (!ns->bdev)
 			continue;
@@ -331,9 +331,10 @@ static u32 nvmet_format_ana_group(struct nvmet_req *req, u32 grpid,
 	u32 count = 0;
 
 	if (!(req->cmd->get_log_page.lsp & NVME_ANA_LOG_RGO)) {
-		xa_for_each(&ctrl->subsys->namespaces, idx, ns)
+		nvmet_for_each_enabled_ns(&ctrl->subsys->namespaces, idx, ns) {
 			if (ns->anagrpid == grpid)
 				desc->nsids[count++] = cpu_to_le32(ns->nsid);
+		}
 	}
 
 	desc->grpid = cpu_to_le32(grpid);
@@ -772,7 +773,7 @@ static void nvmet_execute_identify_endgrp_list(struct nvmet_req *req)
 		goto out;
 	}
 
-	xa_for_each(&ctrl->subsys->namespaces, idx, ns) {
+	nvmet_for_each_enabled_ns(&ctrl->subsys->namespaces, idx, ns) {
 		if (ns->nsid <= min_endgid)
 			continue;
 
@@ -815,7 +816,7 @@ static void nvmet_execute_identify_nslist(struct nvmet_req *req, bool match_css)
 		goto out;
 	}
 
-	xa_for_each(&ctrl->subsys->namespaces, idx, ns) {
+	nvmet_for_each_enabled_ns(&ctrl->subsys->namespaces, idx, ns) {
 		if (ns->nsid <= min_nsid)
 			continue;
 		if (match_css && req->ns->csi != req->cmd->identify.csi)
diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c
index eeee9e9b854c..2b030f0efc38 100644
--- a/drivers/nvme/target/configfs.c
+++ b/drivers/nvme/target/configfs.c
@@ -810,18 +810,6 @@ static struct configfs_attribute *nvmet_ns_attrs[] = {
 	NULL,
 };
 
-bool nvmet_subsys_nsid_exists(struct nvmet_subsys *subsys, u32 nsid)
-{
-	struct config_item *ns_item;
-	char name[12];
-
-	snprintf(name, sizeof(name), "%u", nsid);
-	mutex_lock(&subsys->namespaces_group.cg_subsys->su_mutex);
-	ns_item = config_group_find_item(&subsys->namespaces_group, name);
-	mutex_unlock(&subsys->namespaces_group.cg_subsys->su_mutex);
-	return ns_item != NULL;
-}
-
 static void nvmet_ns_release(struct config_item *item)
 {
 	struct nvmet_ns *ns = to_nvmet_ns(item);
@@ -2254,12 +2242,17 @@ static ssize_t nvmet_root_discovery_nqn_store(struct config_item *item,
 		const char *page, size_t count)
 {
 	struct list_head *entry;
+	char *old_nqn, *new_nqn;
 	size_t len;
 
 	len = strcspn(page, "\n");
 	if (!len || len > NVMF_NQN_FIELD_LEN - 1)
 		return -EINVAL;
 
+	new_nqn = kstrndup(page, len, GFP_KERNEL);
+	if (!new_nqn)
+		return -ENOMEM;
+
 	down_write(&nvmet_config_sem);
 	list_for_each(entry, &nvmet_subsystems_group.cg_children) {
 		struct config_item *item =
@@ -2268,13 +2261,15 @@ static ssize_t nvmet_root_discovery_nqn_store(struct config_item *item,
 		if (!strncmp(config_item_name(item), page, len)) {
 			pr_err("duplicate NQN %s\n", config_item_name(item));
 			up_write(&nvmet_config_sem);
+			kfree(new_nqn);
 			return -EINVAL;
 		}
 	}
-	memset(nvmet_disc_subsys->subsysnqn, 0, NVMF_NQN_FIELD_LEN);
-	memcpy(nvmet_disc_subsys->subsysnqn, page, len);
+	old_nqn = nvmet_disc_subsys->subsysnqn;
+	nvmet_disc_subsys->subsysnqn = new_nqn;
 	up_write(&nvmet_config_sem);
 
+	kfree(old_nqn);
 	return len;
 }
 
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index 1f4e9989663b..fde6c555af61 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -127,7 +127,7 @@ static u32 nvmet_max_nsid(struct nvmet_subsys *subsys)
 	unsigned long idx;
 	u32 nsid = 0;
 
-	xa_for_each(&subsys->namespaces, idx, cur)
+	nvmet_for_each_enabled_ns(&subsys->namespaces, idx, cur)
 		nsid = cur->nsid;
 
 	return nsid;
@@ -441,11 +441,14 @@ u16 nvmet_req_find_ns(struct nvmet_req *req)
 	struct nvmet_subsys *subsys = nvmet_req_subsys(req);
 
 	req->ns = xa_load(&subsys->namespaces, nsid);
-	if (unlikely(!req->ns)) {
+	if (unlikely(!req->ns || !req->ns->enabled)) {
 		req->error_loc = offsetof(struct nvme_common_command, nsid);
-		if (nvmet_subsys_nsid_exists(subsys, nsid))
-			return NVME_SC_INTERNAL_PATH_ERROR;
-		return NVME_SC_INVALID_NS | NVME_STATUS_DNR;
+		if (!req->ns) /* ns doesn't exist! */
+			return NVME_SC_INVALID_NS | NVME_STATUS_DNR;
+
+		/* ns exists but it's disabled */
+		req->ns = NULL;
+		return NVME_SC_INTERNAL_PATH_ERROR;
 	}
 
 	percpu_ref_get(&req->ns->ref);
@@ -583,8 +586,6 @@ int nvmet_ns_enable(struct nvmet_ns *ns)
 		goto out_unlock;
 
 	ret = -EMFILE;
-	if (subsys->nr_namespaces == NVMET_MAX_NAMESPACES)
-		goto out_unlock;
 
 	ret = nvmet_bdev_ns_enable(ns);
 	if (ret == -ENOTBLK)
@@ -599,38 +600,19 @@ int nvmet_ns_enable(struct nvmet_ns *ns)
 	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
 		nvmet_p2pmem_ns_add_p2p(ctrl, ns);
 
-	ret = percpu_ref_init(&ns->ref, nvmet_destroy_namespace,
-				0, GFP_KERNEL);
-	if (ret)
-		goto out_dev_put;
-
-	if (ns->nsid > subsys->max_nsid)
-		subsys->max_nsid = ns->nsid;
-
-	ret = xa_insert(&subsys->namespaces, ns->nsid, ns, GFP_KERNEL);
-	if (ret)
-		goto out_restore_subsys_maxnsid;
-
 	if (ns->pr.enable) {
 		ret = nvmet_pr_init_ns(ns);
 		if (ret)
-			goto out_remove_from_subsys;
+			goto out_dev_put;
 	}
 
-	subsys->nr_namespaces++;
-
 	nvmet_ns_changed(subsys, ns->nsid);
 	ns->enabled = true;
+	xa_set_mark(&subsys->namespaces, ns->nsid, NVMET_NS_ENABLED);
 	ret = 0;
 out_unlock:
 	mutex_unlock(&subsys->lock);
 	return ret;
-
-out_remove_from_subsys:
-	xa_erase(&subsys->namespaces, ns->nsid);
-out_restore_subsys_maxnsid:
-	subsys->max_nsid = nvmet_max_nsid(subsys);
-	percpu_ref_exit(&ns->ref);
 out_dev_put:
 	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
 		pci_dev_put(radix_tree_delete(&ctrl->p2p_ns_map, ns->nsid));
@@ -649,15 +631,37 @@ void nvmet_ns_disable(struct nvmet_ns *ns)
 		goto out_unlock;
 
 	ns->enabled = false;
-	xa_erase(&ns->subsys->namespaces, ns->nsid);
-	if (ns->nsid == subsys->max_nsid)
-		subsys->max_nsid = nvmet_max_nsid(subsys);
+	xa_clear_mark(&subsys->namespaces, ns->nsid, NVMET_NS_ENABLED);
 
 	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
 		pci_dev_put(radix_tree_delete(&ctrl->p2p_ns_map, ns->nsid));
 
 	mutex_unlock(&subsys->lock);
 
+	if (ns->pr.enable)
+		nvmet_pr_exit_ns(ns);
+
+	mutex_lock(&subsys->lock);
+	nvmet_ns_changed(subsys, ns->nsid);
+	nvmet_ns_dev_disable(ns);
+out_unlock:
+	mutex_unlock(&subsys->lock);
+}
+
+void nvmet_ns_free(struct nvmet_ns *ns)
+{
+	struct nvmet_subsys *subsys = ns->subsys;
+
+	nvmet_ns_disable(ns);
+
+	mutex_lock(&subsys->lock);
+
+	xa_erase(&subsys->namespaces, ns->nsid);
+	if (ns->nsid == subsys->max_nsid)
+		subsys->max_nsid = nvmet_max_nsid(subsys);
+
+	mutex_unlock(&subsys->lock);
+
 	/*
 	 * Now that we removed the namespaces from the lookup list, we
 	 * can kill the per_cpu ref and wait for any remaining references
@@ -671,21 +675,9 @@ void nvmet_ns_disable(struct nvmet_ns *ns)
 	wait_for_completion(&ns->disable_done);
 	percpu_ref_exit(&ns->ref);
 
-	if (ns->pr.enable)
-		nvmet_pr_exit_ns(ns);
-
 	mutex_lock(&subsys->lock);
-
 	subsys->nr_namespaces--;
-	nvmet_ns_changed(subsys, ns->nsid);
-	nvmet_ns_dev_disable(ns);
-out_unlock:
 	mutex_unlock(&subsys->lock);
-}
-
-void nvmet_ns_free(struct nvmet_ns *ns)
-{
-	nvmet_ns_disable(ns);
 
 	down_write(&nvmet_ana_sem);
 	nvmet_ana_group_enabled[ns->anagrpid]--;
@@ -699,15 +691,33 @@ struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid)
 {
 	struct nvmet_ns *ns;
 
+	mutex_lock(&subsys->lock);
+
+	if (subsys->nr_namespaces == NVMET_MAX_NAMESPACES)
+		goto out_unlock;
+
 	ns = kzalloc(sizeof(*ns), GFP_KERNEL);
 	if (!ns)
-		return NULL;
+		goto out_unlock;
 
 	init_completion(&ns->disable_done);
 
 	ns->nsid = nsid;
 	ns->subsys = subsys;
 
+	if (percpu_ref_init(&ns->ref, nvmet_destroy_namespace, 0, GFP_KERNEL))
+		goto out_free;
+
+	if (ns->nsid > subsys->max_nsid)
+		subsys->max_nsid = nsid;
+
+	if (xa_insert(&subsys->namespaces, ns->nsid, ns, GFP_KERNEL))
+		goto out_exit;
+
+	subsys->nr_namespaces++;
+
+	mutex_unlock(&subsys->lock);
+
 	down_write(&nvmet_ana_sem);
 	ns->anagrpid = NVMET_DEFAULT_ANA_GRPID;
 	nvmet_ana_group_enabled[ns->anagrpid]++;
@@ -718,6 +728,14 @@ struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid)
 	ns->csi = NVME_CSI_NVM;
 
 	return ns;
+out_exit:
+	subsys->max_nsid = nvmet_max_nsid(subsys);
+	percpu_ref_exit(&ns->ref);
+out_free:
+	kfree(ns);
+out_unlock:
+	mutex_unlock(&subsys->lock);
+	return NULL;
 }
 
 static void nvmet_update_sq_head(struct nvmet_req *req)
@@ -1394,7 +1412,7 @@ static void nvmet_setup_p2p_ns_map(struct nvmet_ctrl *ctrl,
 
 	ctrl->p2p_client = get_device(req->p2p_client);
 
-	xa_for_each(&ctrl->subsys->namespaces, idx, ns)
+	nvmet_for_each_enabled_ns(&ctrl->subsys->namespaces, idx, ns)
 		nvmet_p2pmem_ns_add_p2p(ctrl, ns);
 }
 
diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c
index 0bda83d0fc3e..eaf31c823cbe 100644
--- a/drivers/nvme/target/io-cmd-bdev.c
+++ b/drivers/nvme/target/io-cmd-bdev.c
@@ -36,7 +36,7 @@ void nvmet_bdev_set_limits(struct block_device *bdev, struct nvme_id_ns *id)
 	 */
 	id->nsfeat |= 1 << 4;
 	/* NPWG = Namespace Preferred Write Granularity. 0's based */
-	id->npwg = lpp0b;
+	id->npwg = to0based(bdev_io_min(bdev) / bdev_logical_block_size(bdev));
 	/* NPWA = Namespace Preferred Write Alignment. 0's based */
 	id->npwa = id->npwg;
 	/* NPDG = Namespace Preferred Deallocate Granularity. 0's based */
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index 58328b35dc96..7233549f7c8a 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -24,6 +24,7 @@
 
 #define NVMET_DEFAULT_VS		NVME_VS(2, 1, 0)
 
+#define NVMET_NS_ENABLED		XA_MARK_1
 #define NVMET_ASYNC_EVENTS		4
 #define NVMET_ERROR_LOG_SLOTS		128
 #define NVMET_NO_ERROR_LOC		((u16)-1)
@@ -33,6 +34,12 @@
 #define NVMET_FR_MAX_SIZE		8
 #define NVMET_PR_LOG_QUEUE_SIZE		64
 
+#define nvmet_for_each_ns(xa, index, entry) \
+	xa_for_each(xa, index, entry)
+
+#define nvmet_for_each_enabled_ns(xa, index, entry) \
+	xa_for_each_marked(xa, index, entry, NVMET_NS_ENABLED)
+
 /*
  * Supported optional AENs:
  */
diff --git a/drivers/nvme/target/pr.c b/drivers/nvme/target/pr.c
index 90e9f5bbe581..cd22d8333314 100644
--- a/drivers/nvme/target/pr.c
+++ b/drivers/nvme/target/pr.c
@@ -60,7 +60,7 @@ u16 nvmet_set_feat_resv_notif_mask(struct nvmet_req *req, u32 mask)
 		goto success;
 	}
 
-	xa_for_each(&ctrl->subsys->namespaces, idx, ns) {
+	nvmet_for_each_enabled_ns(&ctrl->subsys->namespaces, idx, ns) {
 		if (ns->pr.enable)
 			WRITE_ONCE(ns->pr.notify_mask, mask);
 	}
@@ -1056,7 +1056,7 @@ int nvmet_ctrl_init_pr(struct nvmet_ctrl *ctrl)
 	 * nvmet_pr_init_ns(), see more details in nvmet_ns_enable().
 	 * So just check ns->pr.enable.
 	 */
-	xa_for_each(&subsys->namespaces, idx, ns) {
+	nvmet_for_each_enabled_ns(&subsys->namespaces, idx, ns) {
 		if (ns->pr.enable) {
 			ret = nvmet_pr_alloc_and_insert_pc_ref(ns, ctrl->cntlid,
 							&ctrl->hostid);
@@ -1067,7 +1067,7 @@ int nvmet_ctrl_init_pr(struct nvmet_ctrl *ctrl)
 	return 0;
 
 free_per_ctrl_refs:
-	xa_for_each(&subsys->namespaces, idx, ns) {
+	nvmet_for_each_enabled_ns(&subsys->namespaces, idx, ns) {
 		if (ns->pr.enable) {
 			pc_ref = xa_erase(&ns->pr_per_ctrl_refs, ctrl->cntlid);
 			if (pc_ref)
@@ -1087,7 +1087,7 @@ void nvmet_ctrl_destroy_pr(struct nvmet_ctrl *ctrl)
 	kfifo_free(&ctrl->pr_log_mgr.log_queue);
 	mutex_destroy(&ctrl->pr_log_mgr.lock);
 
-	xa_for_each(&ctrl->subsys->namespaces, idx, ns) {
+	nvmet_for_each_enabled_ns(&ctrl->subsys->namespaces, idx, ns) {
 		if (ns->pr.enable) {
 			pc_ref = xa_erase(&ns->pr_per_ctrl_refs, ctrl->cntlid);
 			if (pc_ref)
diff --git a/drivers/ufs/core/ufshcd-priv.h b/drivers/ufs/core/ufshcd-priv.h
index 9ffd94ddf8c7..786f20ef2238 100644
--- a/drivers/ufs/core/ufshcd-priv.h
+++ b/drivers/ufs/core/ufshcd-priv.h
@@ -237,12 +237,6 @@ static inline void ufshcd_vops_config_scaling_param(struct ufs_hba *hba,
 		hba->vops->config_scaling_param(hba, p, data);
 }
 
-static inline void ufshcd_vops_reinit_notify(struct ufs_hba *hba)
-{
-	if (hba->vops && hba->vops->reinit_notify)
-		hba->vops->reinit_notify(hba);
-}
-
 static inline int ufshcd_vops_mcq_config_resource(struct ufs_hba *hba)
 {
 	if (hba->vops && hba->vops->mcq_config_resource)
diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c
index 8a01e4393159..9c26e8767515 100644
--- a/drivers/ufs/core/ufshcd.c
+++ b/drivers/ufs/core/ufshcd.c
@@ -8858,7 +8858,6 @@ static int ufshcd_probe_hba(struct ufs_hba *hba, bool init_dev_params)
 		ufshcd_device_reset(hba);
 		ufs_put_device_desc(hba);
 		ufshcd_hba_stop(hba);
-		ufshcd_vops_reinit_notify(hba);
 		ret = ufshcd_hba_enable(hba);
 		if (ret) {
 			dev_err(hba->dev, "Host controller enable failed\n");
@@ -10591,14 +10590,17 @@ int ufshcd_init(struct ufs_hba *hba, void __iomem *mmio_base, unsigned int irq)
 	}
 
 	/*
-	 * Set the default power management level for runtime and system PM.
+	 * Set the default power management level for runtime and system PM if
+	 * not set by the host controller drivers.
 	 * Default power saving mode is to keep UFS link in Hibern8 state
 	 * and UFS device in sleep state.
 	 */
-	hba->rpm_lvl = ufs_get_desired_pm_lvl_for_dev_link_state(
+	if (!hba->rpm_lvl)
+		hba->rpm_lvl = ufs_get_desired_pm_lvl_for_dev_link_state(
 						UFS_SLEEP_PWR_MODE,
 						UIC_LINK_HIBERN8_STATE);
-	hba->spm_lvl = ufs_get_desired_pm_lvl_for_dev_link_state(
+	if (!hba->spm_lvl)
+		hba->spm_lvl = ufs_get_desired_pm_lvl_for_dev_link_state(
 						UFS_SLEEP_PWR_MODE,
 						UIC_LINK_HIBERN8_STATE);
 
diff --git a/drivers/ufs/host/ufs-qcom.c b/drivers/ufs/host/ufs-qcom.c
index 68040b2ab5f8..91e94fe990b4 100644
--- a/drivers/ufs/host/ufs-qcom.c
+++ b/drivers/ufs/host/ufs-qcom.c
@@ -368,6 +368,11 @@ static int ufs_qcom_power_up_sequence(struct ufs_hba *hba)
 	if (ret)
 		return ret;
 
+	if (phy->power_count) {
+		phy_power_off(phy);
+		phy_exit(phy);
+	}
+
 	/* phy initialization - calibrate the phy */
 	ret = phy_init(phy);
 	if (ret) {
@@ -866,6 +871,7 @@ static u32 ufs_qcom_get_ufs_hci_version(struct ufs_hba *hba)
  */
 static void ufs_qcom_advertise_quirks(struct ufs_hba *hba)
 {
+	const struct ufs_qcom_drvdata *drvdata = of_device_get_match_data(hba->dev);
 	struct ufs_qcom_host *host = ufshcd_get_variant(hba);
 
 	if (host->hw_ver.major == 0x2)
@@ -874,9 +880,8 @@ static void ufs_qcom_advertise_quirks(struct ufs_hba *hba)
 	if (host->hw_ver.major > 0x3)
 		hba->quirks |= UFSHCD_QUIRK_REINIT_AFTER_MAX_GEAR_SWITCH;
 
-	if (of_device_is_compatible(hba->dev->of_node, "qcom,sm8550-ufshc") ||
-	    of_device_is_compatible(hba->dev->of_node, "qcom,sm8650-ufshc"))
-		hba->quirks |= UFSHCD_QUIRK_BROKEN_LSDBS_CAP;
+	if (drvdata && drvdata->quirks)
+		hba->quirks |= drvdata->quirks;
 }
 
 static void ufs_qcom_set_phy_gear(struct ufs_qcom_host *host)
@@ -1064,6 +1069,7 @@ static int ufs_qcom_init(struct ufs_hba *hba)
 	struct device *dev = hba->dev;
 	struct ufs_qcom_host *host;
 	struct ufs_clk_info *clki;
+	const struct ufs_qcom_drvdata *drvdata = of_device_get_match_data(hba->dev);
 
 	host = devm_kzalloc(dev, sizeof(*host), GFP_KERNEL);
 	if (!host)
@@ -1143,6 +1149,9 @@ static int ufs_qcom_init(struct ufs_hba *hba)
 		dev_warn(dev, "%s: failed to configure the testbus %d\n",
 				__func__, err);
 
+	if (drvdata && drvdata->no_phy_retention)
+		hba->spm_lvl = UFS_PM_LVL_5;
+
 	return 0;
 
 out_variant_clear:
@@ -1579,13 +1588,6 @@ static void ufs_qcom_config_scaling_param(struct ufs_hba *hba,
 }
 #endif
 
-static void ufs_qcom_reinit_notify(struct ufs_hba *hba)
-{
-	struct ufs_qcom_host *host = ufshcd_get_variant(hba);
-
-	phy_power_off(host->generic_phy);
-}
-
 /* Resources */
 static const struct ufshcd_res_info ufs_res_info[RES_MAX] = {
 	{.name = "ufs_mem",},
@@ -1825,7 +1827,6 @@ static const struct ufs_hba_variant_ops ufs_hba_qcom_vops = {
 	.device_reset		= ufs_qcom_device_reset,
 	.config_scaling_param = ufs_qcom_config_scaling_param,
 	.program_key		= ufs_qcom_ice_program_key,
-	.reinit_notify		= ufs_qcom_reinit_notify,
 	.mcq_config_resource	= ufs_qcom_mcq_config_resource,
 	.get_hba_mac		= ufs_qcom_get_hba_mac,
 	.op_runtime_config	= ufs_qcom_op_runtime_config,
@@ -1868,9 +1869,15 @@ static void ufs_qcom_remove(struct platform_device *pdev)
 		platform_device_msi_free_irqs_all(hba->dev);
 }
 
+static const struct ufs_qcom_drvdata ufs_qcom_sm8550_drvdata = {
+	.quirks = UFSHCD_QUIRK_BROKEN_LSDBS_CAP,
+	.no_phy_retention = true,
+};
+
 static const struct of_device_id ufs_qcom_of_match[] __maybe_unused = {
 	{ .compatible = "qcom,ufshc" },
-	{ .compatible = "qcom,sm8550-ufshc" },
+	{ .compatible = "qcom,sm8550-ufshc", .data = &ufs_qcom_sm8550_drvdata },
+	{ .compatible = "qcom,sm8650-ufshc", .data = &ufs_qcom_sm8550_drvdata },
 	{},
 };
 MODULE_DEVICE_TABLE(of, ufs_qcom_of_match);
diff --git a/drivers/ufs/host/ufs-qcom.h b/drivers/ufs/host/ufs-qcom.h
index b9de170983c9..919f53682beb 100644
--- a/drivers/ufs/host/ufs-qcom.h
+++ b/drivers/ufs/host/ufs-qcom.h
@@ -217,6 +217,11 @@ struct ufs_qcom_host {
 	bool esi_enabled;
 };
 
+struct ufs_qcom_drvdata {
+	enum ufshcd_quirks quirks;
+	bool no_phy_retention;
+};
+
 static inline u32
 ufs_qcom_get_debug_reg_offset(struct ufs_qcom_host *host, u32 reg)
 {
diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index 1ab58da9f38a..1a4ed5a357d3 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -1661,14 +1661,15 @@ static vm_fault_t vfio_pci_mmap_huge_fault(struct vm_fault *vmf,
 	unsigned long pfn, pgoff = vmf->pgoff - vma->vm_pgoff;
 	vm_fault_t ret = VM_FAULT_SIGBUS;
 
-	if (order && (vmf->address & ((PAGE_SIZE << order) - 1) ||
+	pfn = vma_to_pfn(vma) + pgoff;
+
+	if (order && (pfn & ((1 << order) - 1) ||
+		      vmf->address & ((PAGE_SIZE << order) - 1) ||
 		      vmf->address + (PAGE_SIZE << order) > vma->vm_end)) {
 		ret = VM_FAULT_FALLBACK;
 		goto out;
 	}
 
-	pfn = vma_to_pfn(vma);
-
 	down_read(&vdev->memory_lock);
 
 	if (vdev->pm_runtime_engaged || !__vfio_pci_memory_enabled(vdev))
@@ -1676,18 +1677,18 @@ static vm_fault_t vfio_pci_mmap_huge_fault(struct vm_fault *vmf,
 
 	switch (order) {
 	case 0:
-		ret = vmf_insert_pfn(vma, vmf->address, pfn + pgoff);
+		ret = vmf_insert_pfn(vma, vmf->address, pfn);
 		break;
 #ifdef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP
 	case PMD_ORDER:
-		ret = vmf_insert_pfn_pmd(vmf, __pfn_to_pfn_t(pfn + pgoff,
-							     PFN_DEV), false);
+		ret = vmf_insert_pfn_pmd(vmf,
+					 __pfn_to_pfn_t(pfn, PFN_DEV), false);
 		break;
 #endif
 #ifdef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP
 	case PUD_ORDER:
-		ret = vmf_insert_pfn_pud(vmf, __pfn_to_pfn_t(pfn + pgoff,
-							     PFN_DEV), false);
+		ret = vmf_insert_pfn_pud(vmf,
+					 __pfn_to_pfn_t(pfn, PFN_DEV), false);
 		break;
 #endif
 	default:
diff --git a/drivers/watchdog/stm32_iwdg.c b/drivers/watchdog/stm32_iwdg.c
index d700e0d49bb9..8ad06b54c5ad 100644
--- a/drivers/watchdog/stm32_iwdg.c
+++ b/drivers/watchdog/stm32_iwdg.c
@@ -286,7 +286,7 @@ static int stm32_iwdg_irq_init(struct platform_device *pdev,
 	if (!wdt->data->has_early_wakeup)
 		return 0;
 
-	irq = platform_get_irq(pdev, 0);
+	irq = platform_get_irq_optional(pdev, 0);
 	if (irq <= 0)
 		return 0;
 
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 819c75233235..3bc9ce6c575e 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -57,6 +57,8 @@ static void v9fs_issue_write(struct netfs_io_subrequest *subreq)
 	int err, len;
 
 	len = p9_client_write(fid, subreq->start, &subreq->io_iter, &err);
+	if (len > 0)
+		__set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags);
 	netfs_write_subrequest_terminated(subreq, len ?: err, false);
 }
 
@@ -80,8 +82,10 @@ static void v9fs_issue_read(struct netfs_io_subrequest *subreq)
 	if (pos + total >= i_size_read(rreq->inode))
 		__set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags);
 
-	if (!err)
+	if (!err) {
 		subreq->transferred += total;
+		__set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags);
+	}
 
 	netfs_read_subreq_terminated(subreq, err, false);
 }
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 34107b55f834..ccb6aa8027c5 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -122,7 +122,7 @@ static void afs_issue_write_worker(struct work_struct *work)
 	if (subreq->debug_index == 3)
 		return netfs_write_subrequest_terminated(subreq, -ENOANO, false);
 
-	if (!test_bit(NETFS_SREQ_RETRYING, &subreq->flags)) {
+	if (!subreq->retry_count) {
 		set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
 		return netfs_write_subrequest_terminated(subreq, -EAGAIN, false);
 	}
@@ -149,6 +149,9 @@ static void afs_issue_write_worker(struct work_struct *work)
 	afs_wait_for_operation(op);
 	ret = afs_put_operation(op);
 	switch (ret) {
+	case 0:
+		__set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags);
+		break;
 	case -EACCES:
 	case -EPERM:
 	case -ENOKEY:
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 3af8bb0c8d75..4d9305fa37a8 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -4878,25 +4878,29 @@ out_fail:
 	return ret;
 }
 
+struct btrfs_uring_encoded_data {
+	struct btrfs_ioctl_encoded_io_args args;
+	struct iovec iovstack[UIO_FASTIOV];
+	struct iovec *iov;
+	struct iov_iter iter;
+};
+
 static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue_flags)
 {
 	size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args, flags);
 	size_t copy_end;
-	struct btrfs_ioctl_encoded_io_args args = { 0 };
 	int ret;
 	u64 disk_bytenr, disk_io_size;
 	struct file *file;
 	struct btrfs_inode *inode;
 	struct btrfs_fs_info *fs_info;
 	struct extent_io_tree *io_tree;
-	struct iovec iovstack[UIO_FASTIOV];
-	struct iovec *iov = iovstack;
-	struct iov_iter iter;
 	loff_t pos;
 	struct kiocb kiocb;
 	struct extent_state *cached_state = NULL;
 	u64 start, lockend;
 	void __user *sqe_addr;
+	struct btrfs_uring_encoded_data *data = io_uring_cmd_get_async_data(cmd)->op_data;
 
 	if (!capable(CAP_SYS_ADMIN)) {
 		ret = -EPERM;
@@ -4910,43 +4914,64 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue
 
 	if (issue_flags & IO_URING_F_COMPAT) {
 #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
-		struct btrfs_ioctl_encoded_io_args_32 args32;
-
 		copy_end = offsetofend(struct btrfs_ioctl_encoded_io_args_32, flags);
-		if (copy_from_user(&args32, sqe_addr, copy_end)) {
-			ret = -EFAULT;
-			goto out_acct;
-		}
-		args.iov = compat_ptr(args32.iov);
-		args.iovcnt = args32.iovcnt;
-		args.offset = args32.offset;
-		args.flags = args32.flags;
 #else
 		return -ENOTTY;
 #endif
 	} else {
 		copy_end = copy_end_kernel;
-		if (copy_from_user(&args, sqe_addr, copy_end)) {
-			ret = -EFAULT;
+	}
+
+	if (!data) {
+		data = kzalloc(sizeof(*data), GFP_NOFS);
+		if (!data) {
+			ret = -ENOMEM;
 			goto out_acct;
 		}
-	}
 
-	if (args.flags != 0)
-		return -EINVAL;
+		io_uring_cmd_get_async_data(cmd)->op_data = data;
 
-	ret = import_iovec(ITER_DEST, args.iov, args.iovcnt, ARRAY_SIZE(iovstack),
-			   &iov, &iter);
-	if (ret < 0)
-		goto out_acct;
+		if (issue_flags & IO_URING_F_COMPAT) {
+#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
+			struct btrfs_ioctl_encoded_io_args_32 args32;
 
-	if (iov_iter_count(&iter) == 0) {
-		ret = 0;
-		goto out_free;
+			if (copy_from_user(&args32, sqe_addr, copy_end)) {
+				ret = -EFAULT;
+				goto out_acct;
+			}
+
+			data->args.iov = compat_ptr(args32.iov);
+			data->args.iovcnt = args32.iovcnt;
+			data->args.offset = args32.offset;
+			data->args.flags = args32.flags;
+#endif
+		} else {
+			if (copy_from_user(&data->args, sqe_addr, copy_end)) {
+				ret = -EFAULT;
+				goto out_acct;
+			}
+		}
+
+		if (data->args.flags != 0) {
+			ret = -EINVAL;
+			goto out_acct;
+		}
+
+		data->iov = data->iovstack;
+		ret = import_iovec(ITER_DEST, data->args.iov, data->args.iovcnt,
+				   ARRAY_SIZE(data->iovstack), &data->iov,
+				   &data->iter);
+		if (ret < 0)
+			goto out_acct;
+
+		if (iov_iter_count(&data->iter) == 0) {
+			ret = 0;
+			goto out_free;
+		}
 	}
 
-	pos = args.offset;
-	ret = rw_verify_area(READ, file, &pos, args.len);
+	pos = data->args.offset;
+	ret = rw_verify_area(READ, file, &pos, data->args.len);
 	if (ret < 0)
 		goto out_free;
 
@@ -4959,15 +4984,16 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue
 	start = ALIGN_DOWN(pos, fs_info->sectorsize);
 	lockend = start + BTRFS_MAX_UNCOMPRESSED - 1;
 
-	ret = btrfs_encoded_read(&kiocb, &iter, &args, &cached_state,
+	ret = btrfs_encoded_read(&kiocb, &data->iter, &data->args, &cached_state,
 				 &disk_bytenr, &disk_io_size);
 	if (ret < 0 && ret != -EIOCBQUEUED)
 		goto out_free;
 
 	file_accessed(file);
 
-	if (copy_to_user(sqe_addr + copy_end, (const char *)&args + copy_end_kernel,
-			 sizeof(args) - copy_end_kernel)) {
+	if (copy_to_user(sqe_addr + copy_end,
+			 (const char *)&data->args + copy_end_kernel,
+			 sizeof(data->args) - copy_end_kernel)) {
 		if (ret == -EIOCBQUEUED) {
 			unlock_extent(io_tree, start, lockend, &cached_state);
 			btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
@@ -4977,40 +5003,22 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue
 	}
 
 	if (ret == -EIOCBQUEUED) {
-		u64 count;
-
-		/*
-		 * If we've optimized things by storing the iovecs on the stack,
-		 * undo this.
-		 */
-		if (!iov) {
-			iov = kmalloc(sizeof(struct iovec) * args.iovcnt, GFP_NOFS);
-			if (!iov) {
-				unlock_extent(io_tree, start, lockend, &cached_state);
-				btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
-				ret = -ENOMEM;
-				goto out_acct;
-			}
-
-			memcpy(iov, iovstack, sizeof(struct iovec) * args.iovcnt);
-		}
-
-		count = min_t(u64, iov_iter_count(&iter), disk_io_size);
+		u64 count = min_t(u64, iov_iter_count(&data->iter), disk_io_size);
 
 		/* Match ioctl by not returning past EOF if uncompressed. */
-		if (!args.compression)
-			count = min_t(u64, count, args.len);
+		if (!data->args.compression)
+			count = min_t(u64, count, data->args.len);
 
-		ret = btrfs_uring_read_extent(&kiocb, &iter, start, lockend,
-					      cached_state, disk_bytenr,
-					      disk_io_size, count,
-					      args.compression, iov, cmd);
+		ret = btrfs_uring_read_extent(&kiocb, &data->iter, start, lockend,
+					      cached_state, disk_bytenr, disk_io_size,
+					      count, data->args.compression,
+					      data->iov, cmd);
 
 		goto out_acct;
 	}
 
 out_free:
-	kfree(iov);
+	kfree(data->iov);
 
 out_acct:
 	if (ret > 0)
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 204c928beaf9..531312efee8d 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1541,6 +1541,10 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg,
 	u64 extent_gen;
 	int ret;
 
+	if (unlikely(!extent_root)) {
+		btrfs_err(fs_info, "no valid extent root for scrub");
+		return -EUCLEAN;
+	}
 	memset(stripe->sectors, 0, sizeof(struct scrub_sector_verification) *
 				   stripe->nr_sectors);
 	scrub_stripe_reset_bitmaps(stripe);
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index ddf0d5a448a7..c9e92c6941ec 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -174,10 +174,10 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
 					copy_page(workspace->buf + i * PAGE_SIZE,
 						  data_in);
 					start += PAGE_SIZE;
-					workspace->strm.avail_in =
-						(in_buf_folios << PAGE_SHIFT);
 				}
 				workspace->strm.next_in = workspace->buf;
+				workspace->strm.avail_in = min(bytes_left,
+							       in_buf_folios << PAGE_SHIFT);
 			} else {
 				unsigned int pg_off;
 				unsigned int cur_len;
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 11ed523e528e..df905ae82929 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -748,8 +748,9 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
 		     (u64)lim->max_segments << PAGE_SHIFT),
 		fs_info->sectorsize);
 	fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED;
-	if (fs_info->max_zone_append_size < fs_info->max_extent_size)
-		fs_info->max_extent_size = fs_info->max_zone_append_size;
+
+	fs_info->max_extent_size = min_not_zero(fs_info->max_extent_size,
+						fs_info->max_zone_append_size);
 
 	/*
 	 * Check mount options here, because we might change fs_info->zoned
diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c
index 89b11336a836..1806bff8e59b 100644
--- a/fs/cachefiles/daemon.c
+++ b/fs/cachefiles/daemon.c
@@ -15,6 +15,7 @@
 #include <linux/namei.h>
 #include <linux/poll.h>
 #include <linux/mount.h>
+#include <linux/security.h>
 #include <linux/statfs.h>
 #include <linux/ctype.h>
 #include <linux/string.h>
@@ -576,7 +577,7 @@ static int cachefiles_daemon_dir(struct cachefiles_cache *cache, char *args)
  */
 static int cachefiles_daemon_secctx(struct cachefiles_cache *cache, char *args)
 {
-	char *secctx;
+	int err;
 
 	_enter(",%s", args);
 
@@ -585,16 +586,16 @@ static int cachefiles_daemon_secctx(struct cachefiles_cache *cache, char *args)
 		return -EINVAL;
 	}
 
-	if (cache->secctx) {
+	if (cache->have_secid) {
 		pr_err("Second security context specified\n");
 		return -EINVAL;
 	}
 
-	secctx = kstrdup(args, GFP_KERNEL);
-	if (!secctx)
-		return -ENOMEM;
+	err = security_secctx_to_secid(args, strlen(args), &cache->secid);
+	if (err)
+		return err;
 
-	cache->secctx = secctx;
+	cache->have_secid = true;
 	return 0;
 }
 
@@ -820,7 +821,6 @@ static void cachefiles_daemon_unbind(struct cachefiles_cache *cache)
 	put_cred(cache->cache_cred);
 
 	kfree(cache->rootdirname);
-	kfree(cache->secctx);
 	kfree(cache->tag);
 
 	_leave("");
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index 7b99bd98de75..38c236e38cef 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -122,7 +122,6 @@ struct cachefiles_cache {
 #define CACHEFILES_STATE_CHANGED	3	/* T if state changed (poll trigger) */
 #define CACHEFILES_ONDEMAND_MODE	4	/* T if in on-demand read mode */
 	char				*rootdirname;	/* name of cache root directory */
-	char				*secctx;	/* LSM security context */
 	char				*tag;		/* cache binding tag */
 	refcount_t			unbind_pincount;/* refcount to do daemon unbind */
 	struct xarray			reqs;		/* xarray of pending on-demand requests */
@@ -130,6 +129,8 @@ struct cachefiles_cache {
 	struct xarray			ondemand_ids;	/* xarray for ondemand_id allocation */
 	u32				ondemand_id_next;
 	u32				msg_id_next;
+	u32				secid;		/* LSM security id */
+	bool				have_secid;	/* whether "secid" was set */
 };
 
 static inline bool cachefiles_in_ondemand_mode(struct cachefiles_cache *cache)
diff --git a/fs/cachefiles/security.c b/fs/cachefiles/security.c
index fe777164f1d8..fc6611886b3b 100644
--- a/fs/cachefiles/security.c
+++ b/fs/cachefiles/security.c
@@ -18,7 +18,7 @@ int cachefiles_get_security_ID(struct cachefiles_cache *cache)
 	struct cred *new;
 	int ret;
 
-	_enter("{%s}", cache->secctx);
+	_enter("{%u}", cache->have_secid ? cache->secid : 0);
 
 	new = prepare_kernel_cred(current);
 	if (!new) {
@@ -26,8 +26,8 @@ int cachefiles_get_security_ID(struct cachefiles_cache *cache)
 		goto error;
 	}
 
-	if (cache->secctx) {
-		ret = set_security_override_from_ctx(new, cache->secctx);
+	if (cache->have_secid) {
+		ret = set_security_override(new, cache->secid);
 		if (ret < 0) {
 			put_cred(new);
 			pr_err("Security denies permission to nominate security context: error %d\n",
diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c
index fe0a9b8a0cd0..3103b932b674 100644
--- a/fs/exfat/dir.c
+++ b/fs/exfat/dir.c
@@ -122,7 +122,7 @@ static int exfat_readdir(struct inode *inode, loff_t *cpos, struct exfat_dir_ent
 			type = exfat_get_entry_type(ep);
 			if (type == TYPE_UNUSED) {
 				brelse(bh);
-				break;
+				goto out;
 			}
 
 			if (type != TYPE_FILE && type != TYPE_DIR) {
@@ -170,6 +170,7 @@ static int exfat_readdir(struct inode *inode, loff_t *cpos, struct exfat_dir_ent
 		}
 	}
 
+out:
 	dir_entry->namebuf.lfn[0] = '\0';
 	*cpos = EXFAT_DEN_TO_B(dentry);
 	return 0;
diff --git a/fs/exfat/fatent.c b/fs/exfat/fatent.c
index 773c320d68f3..9e5492ac409b 100644
--- a/fs/exfat/fatent.c
+++ b/fs/exfat/fatent.c
@@ -216,6 +216,16 @@ static int __exfat_free_cluster(struct inode *inode, struct exfat_chain *p_chain
 
 			if (err)
 				goto dec_used_clus;
+
+			if (num_clusters >= sbi->num_clusters - EXFAT_FIRST_CLUSTER) {
+				/*
+				 * The cluster chain includes a loop, scan the
+				 * bitmap to get the number of used clusters.
+				 */
+				exfat_count_used_clusters(sb, &sbi->used_clusters);
+
+				return 0;
+			}
 		} while (clu != EXFAT_EOF_CLUSTER);
 	}
 
diff --git a/fs/exfat/file.c b/fs/exfat/file.c
index fb38769c3e39..05b51e721783 100644
--- a/fs/exfat/file.c
+++ b/fs/exfat/file.c
@@ -545,6 +545,7 @@ static int exfat_extend_valid_size(struct file *file, loff_t new_valid_size)
 	while (pos < new_valid_size) {
 		u32 len;
 		struct folio *folio;
+		unsigned long off;
 
 		len = PAGE_SIZE - (pos & (PAGE_SIZE - 1));
 		if (pos + len > new_valid_size)
@@ -554,6 +555,9 @@ static int exfat_extend_valid_size(struct file *file, loff_t new_valid_size)
 		if (err)
 			goto out;
 
+		off = offset_in_folio(folio, pos);
+		folio_zero_new_buffers(folio, off, off + len);
+
 		err = ops->write_end(file, mapping, pos, len, len, folio, NULL);
 		if (err < 0)
 			goto out;
@@ -563,6 +567,8 @@ static int exfat_extend_valid_size(struct file *file, loff_t new_valid_size)
 		cond_resched();
 	}
 
+	return 0;
+
 out:
 	return err;
 }
diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c
index 97d2774760fe..099f80645072 100644
--- a/fs/exfat/namei.c
+++ b/fs/exfat/namei.c
@@ -330,8 +330,8 @@ static int exfat_find_empty_entry(struct inode *inode,
 
 	while ((dentry = exfat_search_empty_slot(sb, &hint_femp, p_dir,
 					num_entries, es)) < 0) {
-		if (dentry == -EIO)
-			break;
+		if (dentry != -ENOSPC)
+			return dentry;
 
 		if (exfat_check_max_dentries(inode))
 			return -ENOSPC;
diff --git a/fs/file.c b/fs/file.c
index fb1011cf6b4a..25c6e53b03f8 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -22,6 +22,7 @@
 #include <linux/close_range.h>
 #include <linux/file_ref.h>
 #include <net/sock.h>
+#include <linux/init_task.h>
 
 #include "internal.h"
 
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 494ac372ace0..e540d05549ff 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1681,6 +1681,8 @@ static int fuse_dir_open(struct inode *inode, struct file *file)
 		 */
 		if (ff->open_flags & (FOPEN_STREAM | FOPEN_NONSEEKABLE))
 			nonseekable_open(inode, file);
+		if (!(ff->open_flags & FOPEN_KEEP_CACHE))
+			invalidate_inode_pages2(inode->i_mapping);
 	}
 
 	return err;
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 3bee9b5dba5e..fe09c2093a93 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -349,11 +349,13 @@ static int hfs_fill_super(struct super_block *sb, struct fs_context *fc)
 		goto bail_no_root;
 	res = hfs_cat_find_brec(sb, HFS_ROOT_CNID, &fd);
 	if (!res) {
-		if (fd.entrylength > sizeof(rec) || fd.entrylength < 0) {
+		if (fd.entrylength != sizeof(rec.dir)) {
 			res =  -EIO;
 			goto bail_hfs_find;
 		}
 		hfs_bnode_read(fd.bnode, &rec, fd.entryoffset, fd.entrylength);
+		if (rec.type != HFS_CDR_DIR)
+			res = -EIO;
 	}
 	if (res)
 		goto bail_hfs_find;
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 955f19e27e47..54dc27d92781 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -1774,7 +1774,8 @@ static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos)
  */
 static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc,
 		struct writeback_control *wbc, struct folio *folio,
-		struct inode *inode, loff_t pos, unsigned len)
+		struct inode *inode, loff_t pos, loff_t end_pos,
+		unsigned len)
 {
 	struct iomap_folio_state *ifs = folio->private;
 	size_t poff = offset_in_folio(folio, pos);
@@ -1793,15 +1794,60 @@ new_ioend:
 
 	if (ifs)
 		atomic_add(len, &ifs->write_bytes_pending);
+
+	/*
+	 * Clamp io_offset and io_size to the incore EOF so that ondisk
+	 * file size updates in the ioend completion are byte-accurate.
+	 * This avoids recovering files with zeroed tail regions when
+	 * writeback races with appending writes:
+	 *
+	 *    Thread 1:                  Thread 2:
+	 *    ------------               -----------
+	 *    write [A, A+B]
+	 *    update inode size to A+B
+	 *    submit I/O [A, A+BS]
+	 *                               write [A+B, A+B+C]
+	 *                               update inode size to A+B+C
+	 *    <I/O completes, updates disk size to min(A+B+C, A+BS)>
+	 *    <power failure>
+	 *
+	 *  After reboot:
+	 *    1) with A+B+C < A+BS, the file has zero padding in range
+	 *       [A+B, A+B+C]
+	 *
+	 *    |<     Block Size (BS)   >|
+	 *    |DDDDDDDDDDDD0000000000000|
+	 *    ^           ^        ^
+	 *    A          A+B     A+B+C
+	 *                       (EOF)
+	 *
+	 *    2) with A+B+C > A+BS, the file has zero padding in range
+	 *       [A+B, A+BS]
+	 *
+	 *    |<     Block Size (BS)   >|<     Block Size (BS)    >|
+	 *    |DDDDDDDDDDDD0000000000000|00000000000000000000000000|
+	 *    ^           ^             ^           ^
+	 *    A          A+B           A+BS       A+B+C
+	 *                             (EOF)
+	 *
+	 *    D = Valid Data
+	 *    0 = Zero Padding
+	 *
+	 * Note that this defeats the ability to chain the ioends of
+	 * appending writes.
+	 */
 	wpc->ioend->io_size += len;
+	if (wpc->ioend->io_offset + wpc->ioend->io_size > end_pos)
+		wpc->ioend->io_size = end_pos - wpc->ioend->io_offset;
+
 	wbc_account_cgroup_owner(wbc, folio, len);
 	return 0;
 }
 
 static int iomap_writepage_map_blocks(struct iomap_writepage_ctx *wpc,
 		struct writeback_control *wbc, struct folio *folio,
-		struct inode *inode, u64 pos, unsigned dirty_len,
-		unsigned *count)
+		struct inode *inode, u64 pos, u64 end_pos,
+		unsigned dirty_len, unsigned *count)
 {
 	int error;
 
@@ -1826,7 +1872,7 @@ static int iomap_writepage_map_blocks(struct iomap_writepage_ctx *wpc,
 			break;
 		default:
 			error = iomap_add_to_ioend(wpc, wbc, folio, inode, pos,
-					map_len);
+					end_pos, map_len);
 			if (!error)
 				(*count)++;
 			break;
@@ -1897,11 +1943,11 @@ static bool iomap_writepage_handle_eof(struct folio *folio, struct inode *inode,
 		 *    remaining memory is zeroed when mapped, and writes to that
 		 *    region are not written out to the file.
 		 *
-		 * Also adjust the writeback range to skip all blocks entirely
-		 * beyond i_size.
+		 * Also adjust the end_pos to the end of file and skip writeback
+		 * for all blocks entirely beyond i_size.
 		 */
 		folio_zero_segment(folio, poff, folio_size(folio));
-		*end_pos = round_up(isize, i_blocksize(inode));
+		*end_pos = isize;
 	}
 
 	return true;
@@ -1914,6 +1960,7 @@ static int iomap_writepage_map(struct iomap_writepage_ctx *wpc,
 	struct inode *inode = folio->mapping->host;
 	u64 pos = folio_pos(folio);
 	u64 end_pos = pos + folio_size(folio);
+	u64 end_aligned = 0;
 	unsigned count = 0;
 	int error = 0;
 	u32 rlen;
@@ -1955,9 +2002,10 @@ static int iomap_writepage_map(struct iomap_writepage_ctx *wpc,
 	/*
 	 * Walk through the folio to find dirty areas to write back.
 	 */
-	while ((rlen = iomap_find_dirty_range(folio, &pos, end_pos))) {
+	end_aligned = round_up(end_pos, i_blocksize(inode));
+	while ((rlen = iomap_find_dirty_range(folio, &pos, end_aligned))) {
 		error = iomap_writepage_map_blocks(wpc, wbc, folio, inode,
-				pos, rlen, &count);
+				pos, end_pos, rlen, &count);
 		if (error)
 			break;
 		pos += rlen;
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 9153ff3a08e7..e8e80761ac73 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -772,9 +772,9 @@ start_journal_io:
 	/*
 	 * If the journal is not located on the file system device,
 	 * then we must flush the file system device before we issue
-	 * the commit record
+	 * the commit record and update the journal tail sequence.
 	 */
-	if (commit_transaction->t_need_data_flush &&
+	if ((commit_transaction->t_need_data_flush || update_tail) &&
 	    (journal->j_fs_dev != journal->j_dev) &&
 	    (journal->j_flags & JBD2_BARRIER))
 		blkdev_issue_flush(journal->j_fs_dev);
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index 4556e4689024..ce63d5fde9c3 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -654,7 +654,7 @@ static void flush_descriptor(journal_t *journal,
 	set_buffer_jwrite(descriptor);
 	BUFFER_TRACE(descriptor, "write");
 	set_buffer_dirty(descriptor);
-	write_dirty_buffer(descriptor, REQ_SYNC);
+	write_dirty_buffer(descriptor, JBD2_JOURNAL_REQ_FLAGS);
 }
 #endif
 
diff --git a/fs/namespace.c b/fs/namespace.c
index 23e81c2a1e3f..6eec7794f707 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2055,9 +2055,15 @@ SYSCALL_DEFINE1(oldumount, char __user *, name)
 
 static bool is_mnt_ns_file(struct dentry *dentry)
 {
+	struct ns_common *ns;
+
 	/* Is this a proxy for a mount namespace? */
-	return dentry->d_op == &ns_dentry_operations &&
-	       dentry->d_fsdata == &mntns_operations;
+	if (dentry->d_op != &ns_dentry_operations)
+		return false;
+
+	ns = d_inode(dentry)->i_private;
+
+	return ns->ops == &mntns_operations;
 }
 
 struct ns_common *from_mnt_ns(struct mnt_namespace *mnt)
diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c
index 7ac34550c403..4dc9b8286355 100644
--- a/fs/netfs/buffered_read.c
+++ b/fs/netfs/buffered_read.c
@@ -275,22 +275,14 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq)
 			netfs_stat(&netfs_n_rh_download);
 			if (rreq->netfs_ops->prepare_read) {
 				ret = rreq->netfs_ops->prepare_read(subreq);
-				if (ret < 0) {
-					atomic_dec(&rreq->nr_outstanding);
-					netfs_put_subrequest(subreq, false,
-							     netfs_sreq_trace_put_cancel);
-					break;
-				}
+				if (ret < 0)
+					goto prep_failed;
 				trace_netfs_sreq(subreq, netfs_sreq_trace_prepare);
 			}
 
 			slice = netfs_prepare_read_iterator(subreq);
-			if (slice < 0) {
-				atomic_dec(&rreq->nr_outstanding);
-				netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_cancel);
-				ret = slice;
-				break;
-			}
+			if (slice < 0)
+				goto prep_iter_failed;
 
 			rreq->netfs_ops->issue_read(subreq);
 			goto done;
@@ -302,6 +294,8 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq)
 			trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
 			netfs_stat(&netfs_n_rh_zero);
 			slice = netfs_prepare_read_iterator(subreq);
+			if (slice < 0)
+				goto prep_iter_failed;
 			__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
 			netfs_read_subreq_terminated(subreq, 0, false);
 			goto done;
@@ -310,6 +304,8 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq)
 		if (source == NETFS_READ_FROM_CACHE) {
 			trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
 			slice = netfs_prepare_read_iterator(subreq);
+			if (slice < 0)
+				goto prep_iter_failed;
 			netfs_read_cache_to_pagecache(rreq, subreq);
 			goto done;
 		}
@@ -318,6 +314,14 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq)
 		WARN_ON_ONCE(1);
 		break;
 
+	prep_iter_failed:
+		ret = slice;
+	prep_failed:
+		subreq->error = ret;
+		atomic_dec(&rreq->nr_outstanding);
+		netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_cancel);
+		break;
+
 	done:
 		size -= slice;
 		start += slice;
diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c
index 88f2adfab75e..173e8b5e6a93 100644
--- a/fs/netfs/direct_write.c
+++ b/fs/netfs/direct_write.c
@@ -104,7 +104,6 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *
 		trace_netfs_rreq(wreq, netfs_rreq_trace_wait_ip);
 		wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS,
 			    TASK_UNINTERRUPTIBLE);
-		smp_rmb(); /* Read error/transferred after RIP flag */
 		ret = wreq->error;
 		if (ret == 0) {
 			ret = wreq->transferred;
diff --git a/fs/netfs/read_collect.c b/fs/netfs/read_collect.c
index 3cbb289535a8..e8624f5c7fcc 100644
--- a/fs/netfs/read_collect.c
+++ b/fs/netfs/read_collect.c
@@ -62,10 +62,14 @@ static void netfs_unlock_read_folio(struct netfs_io_subrequest *subreq,
 		} else {
 			trace_netfs_folio(folio, netfs_folio_trace_read_done);
 		}
+
+		folioq_clear(folioq, slot);
 	} else {
 		// TODO: Use of PG_private_2 is deprecated.
 		if (test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags))
 			netfs_pgpriv2_mark_copy_to_cache(subreq, rreq, folioq, slot);
+		else
+			folioq_clear(folioq, slot);
 	}
 
 	if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) {
@@ -77,8 +81,6 @@ static void netfs_unlock_read_folio(struct netfs_io_subrequest *subreq,
 			folio_unlock(folio);
 		}
 	}
-
-	folioq_clear(folioq, slot);
 }
 
 /*
@@ -247,16 +249,17 @@ donation_changed:
 
 	/* Deal with the trickiest case: that this subreq is in the middle of a
 	 * folio, not touching either edge, but finishes first.  In such a
-	 * case, we donate to the previous subreq, if there is one, so that the
-	 * donation is only handled when that completes - and remove this
-	 * subreq from the list.
+	 * case, we donate to the previous subreq, if there is one and if it is
+	 * contiguous, so that the donation is only handled when that completes
+	 * - and remove this subreq from the list.
 	 *
 	 * If the previous subreq finished first, we will have acquired their
 	 * donation and should be able to unlock folios and/or donate nextwards.
 	 */
 	if (!subreq->consumed &&
 	    !prev_donated &&
-	    !list_is_first(&subreq->rreq_link, &rreq->subrequests)) {
+	    !list_is_first(&subreq->rreq_link, &rreq->subrequests) &&
+	    subreq->start == prev->start + prev->len) {
 		prev = list_prev_entry(subreq, rreq_link);
 		WRITE_ONCE(prev->next_donated, prev->next_donated + subreq->len);
 		subreq->start += subreq->len;
@@ -378,8 +381,7 @@ static void netfs_rreq_assess(struct netfs_io_request *rreq)
 	task_io_account_read(rreq->transferred);
 
 	trace_netfs_rreq(rreq, netfs_rreq_trace_wake_ip);
-	clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &rreq->flags);
-	wake_up_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS);
+	clear_and_wake_up_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags);
 
 	trace_netfs_rreq(rreq, netfs_rreq_trace_done);
 	netfs_clear_subrequests(rreq, false);
@@ -438,7 +440,7 @@ void netfs_read_subreq_progress(struct netfs_io_subrequest *subreq,
 	     rreq->origin == NETFS_READPAGE ||
 	     rreq->origin == NETFS_READ_FOR_WRITE)) {
 		netfs_consume_read_data(subreq, was_async);
-		__clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags);
+		__set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags);
 	}
 }
 EXPORT_SYMBOL(netfs_read_subreq_progress);
@@ -497,7 +499,7 @@ void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq,
 		     rreq->origin == NETFS_READPAGE ||
 		     rreq->origin == NETFS_READ_FOR_WRITE)) {
 			netfs_consume_read_data(subreq, was_async);
-			__clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags);
+			__set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags);
 		}
 		rreq->transferred += subreq->transferred;
 	}
@@ -511,10 +513,13 @@ void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq,
 		} else {
 			trace_netfs_sreq(subreq, netfs_sreq_trace_short);
 			if (subreq->transferred > subreq->consumed) {
-				__set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
-				__clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags);
-				set_bit(NETFS_RREQ_NEED_RETRY, &rreq->flags);
-			} else if (!__test_and_set_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags)) {
+				/* If we didn't read new data, abandon retry. */
+				if (subreq->retry_count &&
+				    test_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags)) {
+					__set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
+					set_bit(NETFS_RREQ_NEED_RETRY, &rreq->flags);
+				}
+			} else if (test_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags)) {
 				__set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
 				set_bit(NETFS_RREQ_NEED_RETRY, &rreq->flags);
 			} else {
diff --git a/fs/netfs/read_pgpriv2.c b/fs/netfs/read_pgpriv2.c
index ba5af89d37fa..54d5004fec18 100644
--- a/fs/netfs/read_pgpriv2.c
+++ b/fs/netfs/read_pgpriv2.c
@@ -170,6 +170,10 @@ void netfs_pgpriv2_write_to_the_cache(struct netfs_io_request *rreq)
 
 	trace_netfs_write(wreq, netfs_write_trace_copy_to_cache);
 	netfs_stat(&netfs_n_wh_copy_to_cache);
+	if (!wreq->io_streams[1].avail) {
+		netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
+		goto couldnt_start;
+	}
 
 	for (;;) {
 		error = netfs_pgpriv2_copy_folio(wreq, folio);
diff --git a/fs/netfs/read_retry.c b/fs/netfs/read_retry.c
index 0350592ea804..21b4a54e545e 100644
--- a/fs/netfs/read_retry.c
+++ b/fs/netfs/read_retry.c
@@ -49,13 +49,15 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq)
 	 * up to the first permanently failed one.
 	 */
 	if (!rreq->netfs_ops->prepare_read &&
-	    !test_bit(NETFS_RREQ_COPY_TO_CACHE, &rreq->flags)) {
+	    !rreq->cache_resources.ops) {
 		struct netfs_io_subrequest *subreq;
 
 		list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
 			if (test_bit(NETFS_SREQ_FAILED, &subreq->flags))
 				break;
 			if (__test_and_clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags)) {
+				__clear_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags);
+				subreq->retry_count++;
 				netfs_reset_iter(subreq);
 				netfs_reissue_read(rreq, subreq);
 			}
@@ -137,7 +139,8 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq)
 			stream0->sreq_max_len = subreq->len;
 
 			__clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
-			__set_bit(NETFS_SREQ_RETRYING, &subreq->flags);
+			__clear_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags);
+			subreq->retry_count++;
 
 			spin_lock_bh(&rreq->lock);
 			list_add_tail(&subreq->rreq_link, &rreq->subrequests);
@@ -213,7 +216,6 @@ abandon:
 			subreq->error = -ENOMEM;
 		__clear_bit(NETFS_SREQ_FAILED, &subreq->flags);
 		__clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
-		__clear_bit(NETFS_SREQ_RETRYING, &subreq->flags);
 	}
 	spin_lock_bh(&rreq->lock);
 	list_splice_tail_init(&queue, &rreq->subrequests);
diff --git a/fs/netfs/write_collect.c b/fs/netfs/write_collect.c
index 1d438be2e1b4..ca3a11ed9b54 100644
--- a/fs/netfs/write_collect.c
+++ b/fs/netfs/write_collect.c
@@ -179,7 +179,6 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq,
 				struct iov_iter source = subreq->io_iter;
 
 				iov_iter_revert(&source, subreq->len - source.count);
-				__set_bit(NETFS_SREQ_RETRYING, &subreq->flags);
 				netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit);
 				netfs_reissue_write(stream, subreq, &source);
 			}
@@ -234,7 +233,7 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq,
 			/* Renegotiate max_len (wsize) */
 			trace_netfs_sreq(subreq, netfs_sreq_trace_retry);
 			__clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
-			__set_bit(NETFS_SREQ_RETRYING, &subreq->flags);
+			subreq->retry_count++;
 			stream->prepare_write(subreq);
 
 			part = min(len, stream->sreq_max_len);
@@ -279,7 +278,7 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq,
 			subreq->start		= start;
 			subreq->debug_index	= atomic_inc_return(&wreq->subreq_counter);
 			subreq->stream_nr	= to->stream_nr;
-			__set_bit(NETFS_SREQ_RETRYING, &subreq->flags);
+			subreq->retry_count	= 1;
 
 			trace_netfs_sreq_ref(wreq->debug_id, subreq->debug_index,
 					     refcount_read(&subreq->ref),
@@ -501,8 +500,7 @@ reassess_streams:
 		goto need_retry;
 	if ((notes & MADE_PROGRESS) && test_bit(NETFS_RREQ_PAUSE, &wreq->flags)) {
 		trace_netfs_rreq(wreq, netfs_rreq_trace_unpause);
-		clear_bit_unlock(NETFS_RREQ_PAUSE, &wreq->flags);
-		wake_up_bit(&wreq->flags, NETFS_RREQ_PAUSE);
+		clear_and_wake_up_bit(NETFS_RREQ_PAUSE, &wreq->flags);
 	}
 
 	if (notes & NEED_REASSESS) {
@@ -605,8 +603,7 @@ void netfs_write_collection_worker(struct work_struct *work)
 
 	_debug("finished");
 	trace_netfs_rreq(wreq, netfs_rreq_trace_wake_ip);
-	clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &wreq->flags);
-	wake_up_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS);
+	clear_and_wake_up_bit(NETFS_RREQ_IN_PROGRESS, &wreq->flags);
 
 	if (wreq->iocb) {
 		size_t written = min(wreq->transferred, wreq->len);
@@ -714,8 +711,7 @@ void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error,
 
 	trace_netfs_sreq(subreq, netfs_sreq_trace_terminated);
 
-	clear_bit_unlock(NETFS_SREQ_IN_PROGRESS, &subreq->flags);
-	wake_up_bit(&subreq->flags, NETFS_SREQ_IN_PROGRESS);
+	clear_and_wake_up_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags);
 
 	/* If we are at the head of the queue, wake up the collector,
 	 * transferring a ref to it if we were the ones to do so.
diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c
index bf6d507578e5..ff0e82505a0b 100644
--- a/fs/netfs/write_issue.c
+++ b/fs/netfs/write_issue.c
@@ -244,6 +244,8 @@ void netfs_reissue_write(struct netfs_io_stream *stream,
 	iov_iter_advance(source, size);
 	iov_iter_truncate(&subreq->io_iter, size);
 
+	subreq->retry_count++;
+	__clear_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags);
 	__set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags);
 	netfs_do_issue_write(stream, subreq);
 }
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index 810269ee0a50..d49e4ce27999 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -263,6 +263,12 @@ int nfs_netfs_readahead(struct readahead_control *ractl)
 static atomic_t nfs_netfs_debug_id;
 static int nfs_netfs_init_request(struct netfs_io_request *rreq, struct file *file)
 {
+	if (!file) {
+		if (WARN_ON_ONCE(rreq->origin != NETFS_PGPRIV2_COPY_TO_CACHE))
+			return -EIO;
+		return 0;
+	}
+
 	rreq->netfs_priv = get_nfs_open_context(nfs_file_open_context(file));
 	rreq->debug_id = atomic_inc_return(&nfs_netfs_debug_id);
 	/* [DEPRECATED] Use PG_private_2 to mark folio being written to the cache. */
@@ -274,7 +280,8 @@ static int nfs_netfs_init_request(struct netfs_io_request *rreq, struct file *fi
 
 static void nfs_netfs_free_request(struct netfs_io_request *rreq)
 {
-	put_nfs_open_context(rreq->netfs_priv);
+	if (rreq->netfs_priv)
+		put_nfs_open_context(rreq->netfs_priv);
 }
 
 static struct nfs_netfs_io_data *nfs_netfs_alloc(struct netfs_io_subrequest *sreq)
diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c
index dec553034027..e933f9c65d90 100644
--- a/fs/notify/fdinfo.c
+++ b/fs/notify/fdinfo.c
@@ -47,10 +47,8 @@ static void show_mark_fhandle(struct seq_file *m, struct inode *inode)
 	size = f->handle_bytes >> 2;
 
 	ret = exportfs_encode_fid(inode, (struct fid *)f->f_handle, &size);
-	if ((ret == FILEID_INVALID) || (ret < 0)) {
-		WARN_ONCE(1, "Can't encode file handler for inotify: %d\n", ret);
+	if ((ret == FILEID_INVALID) || (ret < 0))
 		return;
-	}
 
 	f->handle_type = ret;
 	f->handle_bytes = size * sizeof(u32);
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index 2b0daced98eb..3404e7a30c33 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -893,7 +893,7 @@ static int ocfs2_get_next_id(struct super_block *sb, struct kqid *qid)
 	int status = 0;
 
 	trace_ocfs2_get_next_id(from_kqid(&init_user_ns, *qid), type);
-	if (!sb_has_quota_loaded(sb, type)) {
+	if (!sb_has_quota_active(sb, type)) {
 		status = -ESRCH;
 		goto out;
 	}
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index 73d3367c533b..2956d888c131 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -867,6 +867,7 @@ out:
 	brelse(oinfo->dqi_libh);
 	brelse(oinfo->dqi_lqi_bh);
 	kfree(oinfo);
+	info->dqi_priv = NULL;
 	return status;
 }
 
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 3601ddfeddc2..0c28e5fa3407 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -415,13 +415,13 @@ int ovl_set_attr(struct ovl_fs *ofs, struct dentry *upperdentry,
 	return err;
 }
 
-struct ovl_fh *ovl_encode_real_fh(struct ovl_fs *ofs, struct dentry *real,
+struct ovl_fh *ovl_encode_real_fh(struct ovl_fs *ofs, struct inode *realinode,
 				  bool is_upper)
 {
 	struct ovl_fh *fh;
 	int fh_type, dwords;
 	int buflen = MAX_HANDLE_SZ;
-	uuid_t *uuid = &real->d_sb->s_uuid;
+	uuid_t *uuid = &realinode->i_sb->s_uuid;
 	int err;
 
 	/* Make sure the real fid stays 32bit aligned */
@@ -438,13 +438,13 @@ struct ovl_fh *ovl_encode_real_fh(struct ovl_fs *ofs, struct dentry *real,
 	 * the price or reconnecting the dentry.
 	 */
 	dwords = buflen >> 2;
-	fh_type = exportfs_encode_fh(real, (void *)fh->fb.fid, &dwords, 0);
+	fh_type = exportfs_encode_inode_fh(realinode, (void *)fh->fb.fid,
+					   &dwords, NULL, 0);
 	buflen = (dwords << 2);
 
 	err = -EIO;
-	if (WARN_ON(fh_type < 0) ||
-	    WARN_ON(buflen > MAX_HANDLE_SZ) ||
-	    WARN_ON(fh_type == FILEID_INVALID))
+	if (fh_type < 0 || fh_type == FILEID_INVALID ||
+	    WARN_ON(buflen > MAX_HANDLE_SZ))
 		goto out_err;
 
 	fh->fb.version = OVL_FH_VERSION;
@@ -480,7 +480,7 @@ struct ovl_fh *ovl_get_origin_fh(struct ovl_fs *ofs, struct dentry *origin)
 	if (!ovl_can_decode_fh(origin->d_sb))
 		return NULL;
 
-	return ovl_encode_real_fh(ofs, origin, false);
+	return ovl_encode_real_fh(ofs, d_inode(origin), false);
 }
 
 int ovl_set_origin_fh(struct ovl_fs *ofs, const struct ovl_fh *fh,
@@ -505,7 +505,7 @@ static int ovl_set_upper_fh(struct ovl_fs *ofs, struct dentry *upper,
 	const struct ovl_fh *fh;
 	int err;
 
-	fh = ovl_encode_real_fh(ofs, upper, true);
+	fh = ovl_encode_real_fh(ofs, d_inode(upper), true);
 	if (IS_ERR(fh))
 		return PTR_ERR(fh);
 
diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c
index 5868cb222955..444aeeccb6da 100644
--- a/fs/overlayfs/export.c
+++ b/fs/overlayfs/export.c
@@ -176,35 +176,37 @@ static int ovl_connect_layer(struct dentry *dentry)
  *
  * Return 0 for upper file handle, > 0 for lower file handle or < 0 on error.
  */
-static int ovl_check_encode_origin(struct dentry *dentry)
+static int ovl_check_encode_origin(struct inode *inode)
 {
-	struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
+	struct ovl_fs *ofs = OVL_FS(inode->i_sb);
 	bool decodable = ofs->config.nfs_export;
+	struct dentry *dentry;
+	int err;
 
 	/* No upper layer? */
 	if (!ovl_upper_mnt(ofs))
 		return 1;
 
 	/* Lower file handle for non-upper non-decodable */
-	if (!ovl_dentry_upper(dentry) && !decodable)
+	if (!ovl_inode_upper(inode) && !decodable)
 		return 1;
 
 	/* Upper file handle for pure upper */
-	if (!ovl_dentry_lower(dentry))
+	if (!ovl_inode_lower(inode))
 		return 0;
 
 	/*
 	 * Root is never indexed, so if there's an upper layer, encode upper for
 	 * root.
 	 */
-	if (dentry == dentry->d_sb->s_root)
+	if (inode == d_inode(inode->i_sb->s_root))
 		return 0;
 
 	/*
 	 * Upper decodable file handle for non-indexed upper.
 	 */
-	if (ovl_dentry_upper(dentry) && decodable &&
-	    !ovl_test_flag(OVL_INDEX, d_inode(dentry)))
+	if (ovl_inode_upper(inode) && decodable &&
+	    !ovl_test_flag(OVL_INDEX, inode))
 		return 0;
 
 	/*
@@ -213,14 +215,23 @@ static int ovl_check_encode_origin(struct dentry *dentry)
 	 * ovl_connect_layer() will try to make origin's layer "connected" by
 	 * copying up a "connectable" ancestor.
 	 */
-	if (d_is_dir(dentry) && decodable)
-		return ovl_connect_layer(dentry);
+	if (!decodable || !S_ISDIR(inode->i_mode))
+		return 1;
+
+	dentry = d_find_any_alias(inode);
+	if (!dentry)
+		return -ENOENT;
+
+	err = ovl_connect_layer(dentry);
+	dput(dentry);
+	if (err < 0)
+		return err;
 
 	/* Lower file handle for indexed and non-upper dir/non-dir */
 	return 1;
 }
 
-static int ovl_dentry_to_fid(struct ovl_fs *ofs, struct dentry *dentry,
+static int ovl_dentry_to_fid(struct ovl_fs *ofs, struct inode *inode,
 			     u32 *fid, int buflen)
 {
 	struct ovl_fh *fh = NULL;
@@ -231,13 +242,13 @@ static int ovl_dentry_to_fid(struct ovl_fs *ofs, struct dentry *dentry,
 	 * Check if we should encode a lower or upper file handle and maybe
 	 * copy up an ancestor to make lower file handle connectable.
 	 */
-	err = enc_lower = ovl_check_encode_origin(dentry);
+	err = enc_lower = ovl_check_encode_origin(inode);
 	if (enc_lower < 0)
 		goto fail;
 
 	/* Encode an upper or lower file handle */
-	fh = ovl_encode_real_fh(ofs, enc_lower ? ovl_dentry_lower(dentry) :
-				ovl_dentry_upper(dentry), !enc_lower);
+	fh = ovl_encode_real_fh(ofs, enc_lower ? ovl_inode_lower(inode) :
+				ovl_inode_upper(inode), !enc_lower);
 	if (IS_ERR(fh))
 		return PTR_ERR(fh);
 
@@ -251,8 +262,8 @@ out:
 	return err;
 
 fail:
-	pr_warn_ratelimited("failed to encode file handle (%pd2, err=%i)\n",
-			    dentry, err);
+	pr_warn_ratelimited("failed to encode file handle (ino=%lu, err=%i)\n",
+			    inode->i_ino, err);
 	goto out;
 }
 
@@ -260,19 +271,13 @@ static int ovl_encode_fh(struct inode *inode, u32 *fid, int *max_len,
 			 struct inode *parent)
 {
 	struct ovl_fs *ofs = OVL_FS(inode->i_sb);
-	struct dentry *dentry;
 	int bytes, buflen = *max_len << 2;
 
 	/* TODO: encode connectable file handles */
 	if (parent)
 		return FILEID_INVALID;
 
-	dentry = d_find_any_alias(inode);
-	if (!dentry)
-		return FILEID_INVALID;
-
-	bytes = ovl_dentry_to_fid(ofs, dentry, fid, buflen);
-	dput(dentry);
+	bytes = ovl_dentry_to_fid(ofs, inode, fid, buflen);
 	if (bytes <= 0)
 		return FILEID_INVALID;
 
diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c
index 7e27b7d4adee..cea820cb3b55 100644
--- a/fs/overlayfs/namei.c
+++ b/fs/overlayfs/namei.c
@@ -542,7 +542,7 @@ int ovl_verify_origin_xattr(struct ovl_fs *ofs, struct dentry *dentry,
 	struct ovl_fh *fh;
 	int err;
 
-	fh = ovl_encode_real_fh(ofs, real, is_upper);
+	fh = ovl_encode_real_fh(ofs, d_inode(real), is_upper);
 	err = PTR_ERR(fh);
 	if (IS_ERR(fh)) {
 		fh = NULL;
@@ -738,7 +738,7 @@ int ovl_get_index_name(struct ovl_fs *ofs, struct dentry *origin,
 	struct ovl_fh *fh;
 	int err;
 
-	fh = ovl_encode_real_fh(ofs, origin, false);
+	fh = ovl_encode_real_fh(ofs, d_inode(origin), false);
 	if (IS_ERR(fh))
 		return PTR_ERR(fh);
 
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index b361f35762be..0021e2025020 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -865,7 +865,7 @@ int ovl_copy_up_with_data(struct dentry *dentry);
 int ovl_maybe_copy_up(struct dentry *dentry, int flags);
 int ovl_copy_xattr(struct super_block *sb, const struct path *path, struct dentry *new);
 int ovl_set_attr(struct ovl_fs *ofs, struct dentry *upper, struct kstat *stat);
-struct ovl_fh *ovl_encode_real_fh(struct ovl_fs *ofs, struct dentry *real,
+struct ovl_fh *ovl_encode_real_fh(struct ovl_fs *ofs, struct inode *realinode,
 				  bool is_upper);
 struct ovl_fh *ovl_get_origin_fh(struct ovl_fs *ofs, struct dentry *origin);
 int ovl_set_origin_fh(struct ovl_fs *ofs, const struct ovl_fh *fh,
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 38a5a3e9cba2..f02cd362309a 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1810,7 +1810,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
 		}
 
 		for (; addr != end; addr += PAGE_SIZE, idx++) {
-			unsigned long cur_flags = flags;
+			u64 cur_flags = flags;
 			pagemap_entry_t pme;
 
 			if (folio && (flags & PM_PRESENT) &&
diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c
index 85925ec0051a..3310d1ad4d0e 100644
--- a/fs/qnx6/inode.c
+++ b/fs/qnx6/inode.c
@@ -179,8 +179,7 @@ static int qnx6_statfs(struct dentry *dentry, struct kstatfs *buf)
  */
 static const char *qnx6_checkroot(struct super_block *s)
 {
-	static char match_root[2][3] = {".\0\0", "..\0"};
-	int i, error = 0;
+	int error = 0;
 	struct qnx6_dir_entry *dir_entry;
 	struct inode *root = d_inode(s->s_root);
 	struct address_space *mapping = root->i_mapping;
@@ -189,11 +188,9 @@ static const char *qnx6_checkroot(struct super_block *s)
 	if (IS_ERR(folio))
 		return "error reading root directory";
 	dir_entry = kmap_local_folio(folio, 0);
-	for (i = 0; i < 2; i++) {
-		/* maximum 3 bytes - due to match_root limitation */
-		if (strncmp(dir_entry[i].de_fname, match_root[i], 3))
-			error = 1;
-	}
+	if (memcmp(dir_entry[0].de_fname, ".", 2) ||
+	    memcmp(dir_entry[1].de_fname, "..", 3))
+		error = 1;
 	folio_release_kmap(folio, dir_entry);
 	if (error)
 		return "error reading root directory.";
diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c
index bd42a419458e..6cb1e81993f8 100644
--- a/fs/smb/client/cifssmb.c
+++ b/fs/smb/client/cifssmb.c
@@ -1319,14 +1319,16 @@ cifs_readv_callback(struct mid_q_entry *mid)
 	}
 
 	if (rdata->result == -ENODATA) {
-		__set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags);
 		rdata->result = 0;
+		__set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags);
 	} else {
 		size_t trans = rdata->subreq.transferred + rdata->got_bytes;
 		if (trans < rdata->subreq.len &&
 		    rdata->subreq.start + trans == ictx->remote_i_size) {
-			__set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags);
 			rdata->result = 0;
+			__set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags);
+		} else if (rdata->got_bytes > 0) {
+			__set_bit(NETFS_SREQ_MADE_PROGRESS, &rdata->subreq.flags);
 		}
 	}
 
@@ -1670,10 +1672,13 @@ cifs_writev_callback(struct mid_q_entry *mid)
 		if (written > wdata->subreq.len)
 			written &= 0xFFFF;
 
-		if (written < wdata->subreq.len)
+		if (written < wdata->subreq.len) {
 			result = -ENOSPC;
-		else
+		} else {
 			result = written;
+			if (written > 0)
+				__set_bit(NETFS_SREQ_MADE_PROGRESS, &wdata->subreq.flags);
+		}
 		break;
 	case MID_REQUEST_SUBMITTED:
 	case MID_RETRY_NEEDED:
diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c
index 959359301250..0577556f0a41 100644
--- a/fs/smb/client/smb2pdu.c
+++ b/fs/smb/client/smb2pdu.c
@@ -4615,6 +4615,7 @@ smb2_readv_callback(struct mid_q_entry *mid)
 			__set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags);
 			rdata->result = 0;
 		}
+		__set_bit(NETFS_SREQ_MADE_PROGRESS, &rdata->subreq.flags);
 	}
 	trace_smb3_rw_credits(rreq_debug_id, subreq_debug_index, rdata->credits.value,
 			      server->credits, server->in_flight,
@@ -4842,10 +4843,12 @@ smb2_writev_callback(struct mid_q_entry *mid)
 
 		cifs_stats_bytes_written(tcon, written);
 
-		if (written < wdata->subreq.len)
+		if (written < wdata->subreq.len) {
 			wdata->result = -ENOSPC;
-		else
+		} else if (written > 0) {
 			wdata->subreq.len = written;
+			__set_bit(NETFS_SREQ_MADE_PROGRESS, &wdata->subreq.flags);
+		}
 		break;
 	case MID_REQUEST_SUBMITTED:
 	case MID_RETRY_NEEDED:
@@ -5014,7 +5017,7 @@ smb2_async_writev(struct cifs_io_subrequest *wdata)
 	}
 #endif
 
-	if (test_bit(NETFS_SREQ_RETRYING, &wdata->subreq.flags))
+	if (wdata->subreq.retry_count > 0)
 		smb2_set_replay(server, &rqst);
 
 	cifs_dbg(FYI, "async write at %llu %u bytes iter=%zx\n",
diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index f711bfd75c4d..4bf70cfec826 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -412,6 +412,29 @@ struct ethtool_eth_phy_stats {
 	);
 };
 
+/**
+ * struct ethtool_phy_stats - PHY-level statistics counters
+ * @rx_packets: Total successfully received frames
+ * @rx_bytes: Total successfully received bytes
+ * @rx_errors: Total received frames with errors (e.g., CRC errors)
+ * @tx_packets: Total successfully transmitted frames
+ * @tx_bytes: Total successfully transmitted bytes
+ * @tx_errors: Total transmitted frames with errors
+ *
+ * This structure provides a standardized interface for reporting
+ * PHY-level statistics counters. It is designed to expose statistics
+ * commonly provided by PHYs but not explicitly defined in the IEEE
+ * 802.3 standard.
+ */
+struct ethtool_phy_stats {
+	u64 rx_packets;
+	u64 rx_bytes;
+	u64 rx_errors;
+	u64 tx_packets;
+	u64 tx_bytes;
+	u64 tx_errors;
+};
+
 /* Basic IEEE 802.3 MAC Ctrl statistics (30.3.3.*), not otherwise exposed
  * via a more targeted API.
  */
diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
index d495cbdb52cb..38456b42cdb5 100644
--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -176,6 +176,7 @@ struct netpoll;
  *	@real_dev_addr: address of underlying netdevice
  *	@dent: proc dir entry
  *	@vlan_pcpu_stats: ptr to percpu rx stats
+ *	@netpoll: netpoll instance "propagated" down to @real_dev
  */
 struct vlan_dev_priv {
 	unsigned int				nr_ingress_mappings;
@@ -414,6 +415,8 @@ static inline int __vlan_insert_tag(struct sk_buff *skb,
  * doesn't have to worry about freeing the original skb.
  *
  * Does not change skb->protocol so this function can be used during receive.
+ *
+ * Return: modified @skb on success, NULL on error (@skb is freed).
  */
 static inline struct sk_buff *vlan_insert_inner_tag(struct sk_buff *skb,
 						    __be16 vlan_proto,
@@ -443,6 +446,8 @@ static inline struct sk_buff *vlan_insert_inner_tag(struct sk_buff *skb,
  * doesn't have to worry about freeing the original skb.
  *
  * Does not change skb->protocol so this function can be used during receive.
+ *
+ * Return: modified @skb on success, NULL on error (@skb is freed).
  */
 static inline struct sk_buff *vlan_insert_tag(struct sk_buff *skb,
 					      __be16 vlan_proto, u16 vlan_tci)
@@ -461,6 +466,8 @@ static inline struct sk_buff *vlan_insert_tag(struct sk_buff *skb,
  *
  * Following the skb_unshare() example, in case of error, the calling function
  * doesn't have to worry about freeing the original skb.
+ *
+ * Return: modified @skb on success, NULL on error (@skb is freed).
  */
 static inline struct sk_buff *vlan_insert_tag_set_proto(struct sk_buff *skb,
 							__be16 vlan_proto,
@@ -582,7 +589,7 @@ static inline int vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci)
 }
 
 /**
- * vlan_get_protocol - get protocol EtherType.
+ * __vlan_get_protocol_offset() - get protocol EtherType.
  * @skb: skbuff to query
  * @type: first vlan protocol
  * @mac_offset: MAC offset
@@ -808,9 +815,11 @@ static inline netdev_features_t vlan_features_check(struct sk_buff *skb,
  * @h1: Pointer to vlan header
  * @h2: Pointer to vlan header
  *
- * Compare two vlan headers, returns 0 if equal.
+ * Compare two vlan headers.
  *
  * Please note that alignment of h1 & h2 are only guaranteed to be 16 bits.
+ *
+ * Return: 0 if equal, arbitrary non-zero value if not equal.
  */
 static inline unsigned long compare_vlan_header(const struct vlan_hdr *h1,
 						const struct vlan_hdr *h2)
diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h
index 0d5448c0b86c..a3ce553413de 100644
--- a/include/linux/io_uring/cmd.h
+++ b/include/linux/io_uring/cmd.h
@@ -18,6 +18,11 @@ struct io_uring_cmd {
 	u8		pdu[32]; /* available inline for free use */
 };
 
+struct io_uring_cmd_data {
+	struct io_uring_sqe	sqes[2];
+	void			*op_data;
+};
+
 static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe)
 {
 	return sqe->cmd;
@@ -113,4 +118,9 @@ static inline struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd
 	return cmd_to_io_kiocb(cmd)->tctx->task;
 }
 
+static inline struct io_uring_cmd_data *io_uring_cmd_get_async_data(struct io_uring_cmd *cmd)
+{
+	return cmd_to_io_kiocb(cmd)->async_data;
+}
+
 #endif /* _LINUX_IO_URING_CMD_H */
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 5675af6b740c..75bf54e76f3b 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -335,7 +335,7 @@ struct iomap_ioend {
 	u16			io_type;
 	u16			io_flags;	/* IOMAP_F_* */
 	struct inode		*io_inode;	/* file being written to */
-	size_t			io_size;	/* size of the extent */
+	size_t			io_size;	/* size of data within eof */
 	loff_t			io_offset;	/* offset in the file */
 	sector_t		io_sector;	/* start sector of ioend */
 	struct bio		io_bio;		/* MUST BE LAST! */
diff --git a/include/linux/memfd.h b/include/linux/memfd.h
index 3f2cf339ceaf..d437e3070850 100644
--- a/include/linux/memfd.h
+++ b/include/linux/memfd.h
@@ -7,6 +7,7 @@
 #ifdef CONFIG_MEMFD_CREATE
 extern long memfd_fcntl(struct file *file, unsigned int cmd, unsigned int arg);
 struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx);
+unsigned int *memfd_file_seals_ptr(struct file *file);
 #else
 static inline long memfd_fcntl(struct file *f, unsigned int c, unsigned int a)
 {
@@ -16,6 +17,19 @@ static inline struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx)
 {
 	return ERR_PTR(-EINVAL);
 }
+
+static inline unsigned int *memfd_file_seals_ptr(struct file *file)
+{
+	return NULL;
+}
 #endif
 
+/* Retrieve memfd seals associated with the file, if any. */
+static inline unsigned int memfd_file_seals(struct file *file)
+{
+	unsigned int *sealsp = memfd_file_seals_ptr(file);
+
+	return sealsp ? *sealsp : 0;
+}
+
 #endif /* __LINUX_MEMFD_H */
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index ea48eb879a0f..fed666c5bd16 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -691,7 +691,6 @@ struct mlx5_timer {
 	struct timecounter         tc;
 	u32                        nominal_c_mult;
 	unsigned long              overflow_period;
-	struct delayed_work        overflow_work;
 };
 
 struct mlx5_clock {
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 370f533da107..bb99a35fc6a2 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -7025,6 +7025,7 @@ struct mlx5_ifc_alloc_packet_reformat_context_out_bits {
 
 enum {
 	MLX5_REFORMAT_CONTEXT_ANCHOR_MAC_START = 0x1,
+	MLX5_REFORMAT_CONTEXT_ANCHOR_VLAN_START = 0x2,
 	MLX5_REFORMAT_CONTEXT_ANCHOR_IP_START = 0x7,
 	MLX5_REFORMAT_CONTEXT_ANCHOR_TCP_UDP_START = 0x9,
 };
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 338a76ce9083..b1c3db9cf355 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3125,6 +3125,7 @@ static inline bool pagetable_pmd_ctor(struct ptdesc *ptdesc)
 	if (!pmd_ptlock_init(ptdesc))
 		return false;
 	__folio_set_pgtable(folio);
+	ptdesc_pmd_pts_init(ptdesc);
 	lruvec_stat_add_folio(folio, NR_PAGETABLE);
 	return true;
 }
@@ -4101,6 +4102,37 @@ void mem_dump_obj(void *object);
 static inline void mem_dump_obj(void *object) {}
 #endif
 
+static inline bool is_write_sealed(int seals)
+{
+	return seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE);
+}
+
+/**
+ * is_readonly_sealed - Checks whether write-sealed but mapped read-only,
+ *                      in which case writes should be disallowing moving
+ *                      forwards.
+ * @seals: the seals to check
+ * @vm_flags: the VMA flags to check
+ *
+ * Returns whether readonly sealed, in which case writess should be disallowed
+ * going forward.
+ */
+static inline bool is_readonly_sealed(int seals, vm_flags_t vm_flags)
+{
+	/*
+	 * Since an F_SEAL_[FUTURE_]WRITE sealed memfd can be mapped as
+	 * MAP_SHARED and read-only, take care to not allow mprotect to
+	 * revert protections on such mappings. Do this only for shared
+	 * mappings. For private mappings, don't need to mask
+	 * VM_MAYWRITE as we still want them to be COW-writable.
+	 */
+	if (is_write_sealed(seals) &&
+	    ((vm_flags & (VM_SHARED | VM_WRITE)) == VM_SHARED))
+		return true;
+
+	return false;
+}
+
 /**
  * seal_check_write - Check for F_SEAL_WRITE or F_SEAL_FUTURE_WRITE flags and
  *                    handle them.
@@ -4112,24 +4144,15 @@ static inline void mem_dump_obj(void *object) {}
  */
 static inline int seal_check_write(int seals, struct vm_area_struct *vma)
 {
-	if (seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
-		/*
-		 * New PROT_WRITE and MAP_SHARED mmaps are not allowed when
-		 * write seals are active.
-		 */
-		if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE))
-			return -EPERM;
-
-		/*
-		 * Since an F_SEAL_[FUTURE_]WRITE sealed memfd can be mapped as
-		 * MAP_SHARED and read-only, take care to not allow mprotect to
-		 * revert protections on such mappings. Do this only for shared
-		 * mappings. For private mappings, don't need to mask
-		 * VM_MAYWRITE as we still want them to be COW-writable.
-		 */
-		if (vma->vm_flags & VM_SHARED)
-			vm_flags_clear(vma, VM_MAYWRITE);
-	}
+	if (!is_write_sealed(seals))
+		return 0;
+
+	/*
+	 * New PROT_WRITE and MAP_SHARED mmaps are not allowed when
+	 * write seals are active.
+	 */
+	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE))
+		return -EPERM;
 
 	return 0;
 }
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 7361a8f3ab68..332cee285662 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -445,6 +445,7 @@ FOLIO_MATCH(compound_head, _head_2a);
  * @pt_index:         Used for s390 gmap.
  * @pt_mm:            Used for x86 pgds.
  * @pt_frag_refcount: For fragmented page table tracking. Powerpc only.
+ * @pt_share_count:   Used for HugeTLB PMD page table share count.
  * @_pt_pad_2:        Padding to ensure proper alignment.
  * @ptl:              Lock for the page table.
  * @__page_type:      Same as page->page_type. Unused for page tables.
@@ -471,6 +472,9 @@ struct ptdesc {
 		pgoff_t pt_index;
 		struct mm_struct *pt_mm;
 		atomic_t pt_frag_refcount;
+#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING
+		atomic_t pt_share_count;
+#endif
 	};
 
 	union {
@@ -516,6 +520,32 @@ static_assert(sizeof(struct ptdesc) <= sizeof(struct page));
 	const struct page *:		(const struct ptdesc *)(p),	\
 	struct page *:			(struct ptdesc *)(p)))
 
+#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING
+static inline void ptdesc_pmd_pts_init(struct ptdesc *ptdesc)
+{
+	atomic_set(&ptdesc->pt_share_count, 0);
+}
+
+static inline void ptdesc_pmd_pts_inc(struct ptdesc *ptdesc)
+{
+	atomic_inc(&ptdesc->pt_share_count);
+}
+
+static inline void ptdesc_pmd_pts_dec(struct ptdesc *ptdesc)
+{
+	atomic_dec(&ptdesc->pt_share_count);
+}
+
+static inline int ptdesc_pmd_pts_count(struct ptdesc *ptdesc)
+{
+	return atomic_read(&ptdesc->pt_share_count);
+}
+#else
+static inline void ptdesc_pmd_pts_init(struct ptdesc *ptdesc)
+{
+}
+#endif
+
 /*
  * Used for sizing the vmemmap region on some architectures
  */
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 2593019ad5b1..dd8f6f8991fe 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2261,7 +2261,7 @@ struct net_device {
 	void 			*atalk_ptr;
 #endif
 #if IS_ENABLED(CONFIG_AX25)
-	void			*ax25_ptr;
+	struct ax25_dev	__rcu	*ax25_ptr;
 #endif
 #if IS_ENABLED(CONFIG_CFG80211)
 	struct wireless_dev	*ieee80211_ptr;
@@ -3238,7 +3238,6 @@ static inline void unregister_netdevice(struct net_device *dev)
 
 int netdev_refcnt_read(const struct net_device *dev);
 void free_netdev(struct net_device *dev);
-void init_dummy_netdev(struct net_device *dev);
 
 struct net_device *netdev_get_xmit_slave(struct net_device *dev,
 					 struct sk_buff *skb,
@@ -3252,7 +3251,6 @@ struct net_device *netdev_get_by_index(struct net *net, int ifindex,
 struct net_device *netdev_get_by_name(struct net *net, const char *name,
 				      netdevice_tracker *tracker, gfp_t gfp);
 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex);
-struct net_device *dev_get_by_napi_id(unsigned int napi_id);
 void netdev_copy_name(struct net_device *dev, char *name);
 
 static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev,
@@ -4295,7 +4293,7 @@ static inline bool netif_carrier_ok(const struct net_device *dev)
 
 unsigned long dev_trans_start(struct net_device *dev);
 
-void __netdev_watchdog_up(struct net_device *dev);
+void netdev_watchdog_up(struct net_device *dev);
 
 void netif_carrier_on(struct net_device *dev);
 void netif_carrier_off(struct net_device *dev);
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 5eaceef41e6c..ecdd5ced16a8 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -185,6 +185,7 @@ struct netfs_io_subrequest {
 	short			error;		/* 0 or error that occurred */
 	unsigned short		debug_index;	/* Index in list (for debugging output) */
 	unsigned int		nr_segs;	/* Number of segs in io_iter */
+	u8			retry_count;	/* The number of retries (0 on initial pass) */
 	enum netfs_io_source	source;		/* Where to read from/write to */
 	unsigned char		stream_nr;	/* I/O stream this belongs to */
 	unsigned char		curr_folioq_slot; /* Folio currently being read */
@@ -194,14 +195,13 @@ struct netfs_io_subrequest {
 #define NETFS_SREQ_COPY_TO_CACHE	0	/* Set if should copy the data to the cache */
 #define NETFS_SREQ_CLEAR_TAIL		1	/* Set if the rest of the read should be cleared */
 #define NETFS_SREQ_SEEK_DATA_READ	3	/* Set if ->read() should SEEK_DATA first */
-#define NETFS_SREQ_NO_PROGRESS		4	/* Set if we didn't manage to read any data */
+#define NETFS_SREQ_MADE_PROGRESS	4	/* Set if we transferred at least some data */
 #define NETFS_SREQ_ONDEMAND		5	/* Set if it's from on-demand read mode */
 #define NETFS_SREQ_BOUNDARY		6	/* Set if ends on hard boundary (eg. ceph object) */
 #define NETFS_SREQ_HIT_EOF		7	/* Set if short due to EOF */
 #define NETFS_SREQ_IN_PROGRESS		8	/* Unlocked when the subrequest completes */
 #define NETFS_SREQ_NEED_RETRY		9	/* Set if the filesystem requests a retry */
-#define NETFS_SREQ_RETRYING		10	/* Set if we're retrying */
-#define NETFS_SREQ_FAILED		11	/* Set if the subreq failed unretryably */
+#define NETFS_SREQ_FAILED		10	/* Set if the subreq failed unretryably */
 };
 
 enum netfs_io_origin {
@@ -269,7 +269,6 @@ struct netfs_io_request {
 	size_t			prev_donated;	/* Fallback for subreq->prev_donated */
 	refcount_t		ref;
 	unsigned long		flags;
-#define NETFS_RREQ_COPY_TO_CACHE	1	/* Need to write to the cache */
 #define NETFS_RREQ_NO_UNLOCK_FOLIO	2	/* Don't unlock no_unlock_folio on completion */
 #define NETFS_RREQ_DONT_UNLOCK_FOLIOS	3	/* Don't unlock the folios on completion */
 #define NETFS_RREQ_FAILED		4	/* The request failed */
diff --git a/include/linux/pcs/pcs-xpcs.h b/include/linux/pcs/pcs-xpcs.h
index b5b5d17998b8..733f4ddd2ef1 100644
--- a/include/linux/pcs/pcs-xpcs.h
+++ b/include/linux/pcs/pcs-xpcs.h
@@ -50,7 +50,6 @@ struct dw_xpcs;
 
 struct phylink_pcs *xpcs_to_phylink_pcs(struct dw_xpcs *xpcs);
 int xpcs_get_an_mode(struct dw_xpcs *xpcs, phy_interface_t interface);
-void xpcs_get_interfaces(struct dw_xpcs *xpcs, unsigned long *interfaces);
 int xpcs_config_eee(struct dw_xpcs *xpcs, int mult_fact_100ns,
 		    int enable);
 struct dw_xpcs *xpcs_create_mdiodev(struct mii_bus *bus, int addr);
diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h
index 35842d1e3879..5b520fe86b60 100644
--- a/include/linux/percpu-defs.h
+++ b/include/linux/percpu-defs.h
@@ -221,10 +221,7 @@ do {									\
 } while (0)
 
 #define PERCPU_PTR(__p)							\
-({									\
-	unsigned long __pcpu_ptr = (__force unsigned long)(__p);	\
-	(typeof(*(__p)) __force __kernel *)(__pcpu_ptr);		\
-})
+	(typeof(*(__p)) __force __kernel *)((__force unsigned long)(__p))
 
 #ifdef CONFIG_SMP
 
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 5bc71d59910c..afaae74d0949 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1144,6 +1144,53 @@ struct phy_driver {
 	int (*cable_test_get_status)(struct phy_device *dev, bool *finished);
 
 	/* Get statistics from the PHY using ethtool */
+	/**
+	 * @get_phy_stats: Retrieve PHY statistics.
+	 * @dev: The PHY device for which the statistics are retrieved.
+	 * @eth_stats: structure where Ethernet PHY stats will be stored.
+	 * @stats: structure where additional PHY-specific stats will be stored.
+	 *
+	 * Retrieves the supported PHY statistics and populates the provided
+	 * structures. The input structures are pre-initialized with
+	 * `ETHTOOL_STAT_NOT_SET`, and the driver must only modify members
+	 * corresponding to supported statistics. Unmodified members will remain
+	 * set to `ETHTOOL_STAT_NOT_SET` and will not be returned to userspace.
+	 */
+	void (*get_phy_stats)(struct phy_device *dev,
+			      struct ethtool_eth_phy_stats *eth_stats,
+			      struct ethtool_phy_stats *stats);
+
+	/**
+	 * @get_link_stats: Retrieve link statistics.
+	 * @dev: The PHY device for which the statistics are retrieved.
+	 * @link_stats: structure where link-specific stats will be stored.
+	 *
+	 * Retrieves link-related statistics for the given PHY device. The input
+	 * structure is pre-initialized with `ETHTOOL_STAT_NOT_SET`, and the
+	 * driver must only modify members corresponding to supported
+	 * statistics. Unmodified members will remain set to
+	 * `ETHTOOL_STAT_NOT_SET` and will not be returned to userspace.
+	 */
+	void (*get_link_stats)(struct phy_device *dev,
+			       struct ethtool_link_ext_stats *link_stats);
+
+	/**
+	 * @update_stats: Trigger periodic statistics updates.
+	 * @dev: The PHY device for which statistics updates are triggered.
+	 *
+	 * Periodically gathers statistics from the PHY device to update locally
+	 * maintained 64-bit counters. This is necessary for PHYs that implement
+	 * reduced-width counters (e.g., 16-bit or 32-bit) which can overflow
+	 * more frequently compared to 64-bit counters. By invoking this
+	 * callback, drivers can fetch the current counter values, handle
+	 * overflow detection, and accumulate the results into local 64-bit
+	 * counters for accurate reporting through the `get_phy_stats` and
+	 * `get_link_stats` interfaces.
+	 *
+	 * Return: 0 on success or a negative error code on failure.
+	 */
+	int (*update_stats)(struct phy_device *dev);
+
 	/** @get_sset_count: Number of statistic counters */
 	int (*get_sset_count)(struct phy_device *dev);
 	/** @get_strings: Names of the statistic counters */
@@ -1634,6 +1681,9 @@ static inline bool phy_polling_mode(struct phy_device *phydev)
 		if (phydev->drv->flags & PHY_POLL_CABLE_TEST)
 			return true;
 
+	if (phydev->drv->update_stats)
+		return true;
+
 	return phydev->irq == PHY_POLL;
 }
 
@@ -2096,6 +2146,7 @@ int phy_unregister_fixup(const char *bus_id, u32 phy_uid, u32 phy_uid_mask);
 int phy_unregister_fixup_for_id(const char *bus_id);
 int phy_unregister_fixup_for_uid(u32 phy_uid, u32 phy_uid_mask);
 
+int phy_eee_rx_clock_stop(struct phy_device *phydev, bool clk_stop_enable);
 int phy_init_eee(struct phy_device *phydev, bool clk_stop_enable);
 int phy_get_eee_err(struct phy_device *phydev);
 int phy_ethtool_set_eee(struct phy_device *phydev, struct ethtool_keee *data);
@@ -2123,6 +2174,13 @@ int phy_ethtool_get_strings(struct phy_device *phydev, u8 *data);
 int phy_ethtool_get_sset_count(struct phy_device *phydev);
 int phy_ethtool_get_stats(struct phy_device *phydev,
 			  struct ethtool_stats *stats, u64 *data);
+
+void __phy_ethtool_get_phy_stats(struct phy_device *phydev,
+			 struct ethtool_eth_phy_stats *phy_stats,
+			 struct ethtool_phy_stats *phydev_stats);
+void __phy_ethtool_get_link_ext_stats(struct phy_device *phydev,
+				      struct ethtool_link_ext_stats *link_stats);
+
 int phy_ethtool_get_plca_cfg(struct phy_device *phydev,
 			     struct phy_plca_cfg *plca_cfg);
 int phy_ethtool_set_plca_cfg(struct phy_device *phydev,
diff --git a/include/linux/phylib_stubs.h b/include/linux/phylib_stubs.h
index 1279f48c8a70..9d2d6090c86d 100644
--- a/include/linux/phylib_stubs.h
+++ b/include/linux/phylib_stubs.h
@@ -5,6 +5,9 @@
 
 #include <linux/rtnetlink.h>
 
+struct ethtool_eth_phy_stats;
+struct ethtool_link_ext_stats;
+struct ethtool_phy_stats;
 struct kernel_hwtstamp_config;
 struct netlink_ext_ack;
 struct phy_device;
@@ -19,6 +22,11 @@ struct phylib_stubs {
 	int (*hwtstamp_set)(struct phy_device *phydev,
 			    struct kernel_hwtstamp_config *config,
 			    struct netlink_ext_ack *extack);
+	void (*get_phy_stats)(struct phy_device *phydev,
+			      struct ethtool_eth_phy_stats *phy_stats,
+			      struct ethtool_phy_stats *phydev_stats);
+	void (*get_link_ext_stats)(struct phy_device *phydev,
+				   struct ethtool_link_ext_stats *link_stats);
 };
 
 static inline int phy_hwtstamp_get(struct phy_device *phydev,
@@ -50,6 +58,29 @@ static inline int phy_hwtstamp_set(struct phy_device *phydev,
 	return phylib_stubs->hwtstamp_set(phydev, config, extack);
 }
 
+static inline void phy_ethtool_get_phy_stats(struct phy_device *phydev,
+					struct ethtool_eth_phy_stats *phy_stats,
+					struct ethtool_phy_stats *phydev_stats)
+{
+	ASSERT_RTNL();
+
+	if (!phylib_stubs)
+		return;
+
+	phylib_stubs->get_phy_stats(phydev, phy_stats, phydev_stats);
+}
+
+static inline void phy_ethtool_get_link_ext_stats(struct phy_device *phydev,
+				    struct ethtool_link_ext_stats *link_stats)
+{
+	ASSERT_RTNL();
+
+	if (!phylib_stubs)
+		return;
+
+	phylib_stubs->get_link_ext_stats(phydev, link_stats);
+}
+
 #else
 
 static inline int phy_hwtstamp_get(struct phy_device *phydev,
@@ -65,4 +96,15 @@ static inline int phy_hwtstamp_set(struct phy_device *phydev,
 	return -EOPNOTSUPP;
 }
 
+static inline void phy_ethtool_get_phy_stats(struct phy_device *phydev,
+					struct ethtool_eth_phy_stats *phy_stats,
+					struct ethtool_phy_stats *phydev_stats)
+{
+}
+
+static inline void phy_ethtool_get_link_ext_stats(struct phy_device *phydev,
+				    struct ethtool_link_ext_stats *link_stats)
+{
+}
+
 #endif
diff --git a/include/linux/phylink.h b/include/linux/phylink.h
index 5462cc6a37dc..4b7a20620b49 100644
--- a/include/linux/phylink.h
+++ b/include/linux/phylink.h
@@ -393,6 +393,8 @@ struct phylink_pcs_ops;
 
 /**
  * struct phylink_pcs - PHYLINK PCS instance
+ * @supported_interfaces: describing which PHY_INTERFACE_MODE_xxx
+ *                        are supported by this PCS.
  * @ops: a pointer to the &struct phylink_pcs_ops structure
  * @phylink: pointer to &struct phylink_config
  * @neg_mode: provide PCS neg mode via "mode" argument
@@ -409,6 +411,7 @@ struct phylink_pcs_ops;
  * the PCS driver.
  */
 struct phylink_pcs {
+	DECLARE_PHY_INTERFACE_MASK(supported_interfaces);
 	const struct phylink_pcs_ops *ops;
 	struct phylink *phylink;
 	bool neg_mode;
diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index 3b9d132cbc9e..4bc2ee0b10b0 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -102,6 +102,7 @@ void __rtnl_net_unlock(struct net *net);
 void rtnl_net_lock(struct net *net);
 void rtnl_net_unlock(struct net *net);
 int rtnl_net_trylock(struct net *net);
+int rtnl_net_lock_killable(struct net *net);
 int rtnl_net_lock_cmp_fn(const struct lockdep_map *a, const struct lockdep_map *b);
 
 bool rtnl_net_is_locked(struct net *net);
@@ -138,6 +139,11 @@ static inline int rtnl_net_trylock(struct net *net)
 	return rtnl_trylock();
 }
 
+static inline int rtnl_net_lock_killable(struct net *net)
+{
+	return rtnl_lock_killable();
+}
+
 static inline void ASSERT_RTNL_NET(struct net *net)
 {
 	ASSERT_RTNL();
diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index 58337898fa21..f8f91b2038ea 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -546,4 +546,7 @@ int inet6_fill_ifmcaddr(struct sk_buff *skb,
 			const struct ifmcaddr6 *ifmca,
 			struct inet6_fill_args *args);
 
+int inet6_fill_ifacaddr(struct sk_buff *skb,
+			const struct ifacaddr6 *ifaca,
+			struct inet6_fill_args *args);
 #endif
diff --git a/include/net/ax25.h b/include/net/ax25.h
index cb622d84cd0c..4ee141aae0a2 100644
--- a/include/net/ax25.h
+++ b/include/net/ax25.h
@@ -231,6 +231,7 @@ typedef struct ax25_dev {
 #endif
 	refcount_t		refcount;
 	bool device_up;
+	struct rcu_head		rcu;
 } ax25_dev;
 
 typedef struct ax25_cb {
@@ -290,9 +291,8 @@ static inline void ax25_dev_hold(ax25_dev *ax25_dev)
 
 static inline void ax25_dev_put(ax25_dev *ax25_dev)
 {
-	if (refcount_dec_and_test(&ax25_dev->refcount)) {
-		kfree(ax25_dev);
-	}
+	if (refcount_dec_and_test(&ax25_dev->refcount))
+		kfree_rcu(ax25_dev, rcu);
 }
 static inline __be16 ax25_type_trans(struct sk_buff *skb, struct net_device *dev)
 {
@@ -335,9 +335,9 @@ void ax25_digi_invert(const ax25_digi *, ax25_digi *);
 extern spinlock_t ax25_dev_lock;
 
 #if IS_ENABLED(CONFIG_AX25)
-static inline ax25_dev *ax25_dev_ax25dev(struct net_device *dev)
+static inline ax25_dev *ax25_dev_ax25dev(const struct net_device *dev)
 {
-	return dev->ax25_ptr;
+	return rcu_dereference_rtnl(dev->ax25_ptr);
 }
 #endif
 
diff --git a/include/net/dsa.h b/include/net/dsa.h
index 4aeedb296d67..9640d5c67f56 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -991,8 +991,6 @@ struct dsa_switch_ops {
 	bool	(*support_eee)(struct dsa_switch *ds, int port);
 	int	(*set_mac_eee)(struct dsa_switch *ds, int port,
 			       struct ethtool_keee *e);
-	int	(*get_mac_eee)(struct dsa_switch *ds, int port,
-			       struct ethtool_keee *e);
 
 	/* EEPROM access */
 	int	(*get_eeprom_len)(struct dsa_switch *ds);
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 3c82fad904d4..c7f42844c79a 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -282,7 +282,7 @@ static inline int inet_csk_reqsk_queue_len(const struct sock *sk)
 
 static inline int inet_csk_reqsk_queue_is_full(const struct sock *sk)
 {
-	return inet_csk_reqsk_queue_len(sk) >= READ_ONCE(sk->sk_max_ack_backlog);
+	return inet_csk_reqsk_queue_len(sk) > READ_ONCE(sk->sk_max_ack_backlog);
 }
 
 bool inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req);
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 3ccbad881d74..1086256549fa 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -19,6 +19,7 @@
 #include <linux/netdevice.h>
 
 #include <net/flow.h>
+#include <net/inet_dscp.h>
 #include <net/sock.h>
 #include <net/request_sock.h>
 #include <net/netns/hash.h>
@@ -302,6 +303,11 @@ static inline unsigned long inet_cmsg_flags(const struct inet_sock *inet)
 	return READ_ONCE(inet->inet_flags) & IP_CMSG_ALL;
 }
 
+static inline dscp_t inet_sk_dscp(const struct inet_sock *inet)
+{
+	return inet_dsfield_to_dscp(READ_ONCE(inet->tos));
+}
+
 #define inet_test_bit(nr, sk)			\
 	test_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags)
 #define inet_set_bit(nr, sk)			\
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 32c09e85a64c..4b0677e48190 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -38,6 +38,7 @@
 #define XFRM_PROTO_COMP		108
 #define XFRM_PROTO_IPIP		4
 #define XFRM_PROTO_IPV6		41
+#define XFRM_PROTO_IPTFS	IPPROTO_AGGFRAG
 #define XFRM_PROTO_ROUTING	IPPROTO_ROUTING
 #define XFRM_PROTO_DSTOPTS	IPPROTO_DSTOPTS
 
@@ -213,6 +214,7 @@ struct xfrm_state {
 		u16		family;
 		xfrm_address_t	saddr;
 		int		header_len;
+		int		enc_hdr_len;
 		int		trailer_len;
 		u32		extra_flags;
 		struct xfrm_mark	smark;
@@ -303,6 +305,9 @@ struct xfrm_state {
 	 * interpreted by xfrm_type methods. */
 	void			*data;
 	u8			dir;
+
+	const struct xfrm_mode_cbs	*mode_cbs;
+	void				*mode_data;
 };
 
 static inline struct net *xs_net(struct xfrm_state *x)
@@ -460,6 +465,45 @@ struct xfrm_type_offload {
 int xfrm_register_type_offload(const struct xfrm_type_offload *type, unsigned short family);
 void xfrm_unregister_type_offload(const struct xfrm_type_offload *type, unsigned short family);
 
+/**
+ * struct xfrm_mode_cbs - XFRM mode callbacks
+ * @owner: module owner or NULL
+ * @init_state: Add/init mode specific state in `xfrm_state *x`
+ * @clone_state: Copy mode specific values from `orig` to new state `x`
+ * @destroy_state: Cleanup mode specific state from `xfrm_state *x`
+ * @user_init: Process mode specific netlink attributes from user
+ * @copy_to_user: Add netlink attributes to `attrs` based on state in `x`
+ * @sa_len: Return space required to store mode specific netlink attributes
+ * @get_inner_mtu: Return avail payload space after removing encap overhead
+ * @input: Process received packet from SA using mode
+ * @output: Output given packet using mode
+ * @prepare_output: Add mode specific encapsulation to packet in skb. On return
+ *	`transport_header` should point at ESP header, `network_header` should
+ *	point at outer IP header and `mac_header` should opint at the
+ *	protocol/nexthdr field of the outer IP.
+ *
+ * One should examine and understand the specific uses of these callbacks in
+ * xfrm for further detail on how and when these functions are called. RTSL.
+ */
+struct xfrm_mode_cbs {
+	struct module	*owner;
+	int	(*init_state)(struct xfrm_state *x);
+	int	(*clone_state)(struct xfrm_state *x, struct xfrm_state *orig);
+	void	(*destroy_state)(struct xfrm_state *x);
+	int	(*user_init)(struct net *net, struct xfrm_state *x,
+			     struct nlattr **attrs,
+			     struct netlink_ext_ack *extack);
+	int	(*copy_to_user)(struct xfrm_state *x, struct sk_buff *skb);
+	unsigned int (*sa_len)(const struct xfrm_state *x);
+	u32	(*get_inner_mtu)(struct xfrm_state *x, int outer_mtu);
+	int	(*input)(struct xfrm_state *x, struct sk_buff *skb);
+	int	(*output)(struct net *net, struct sock *sk, struct sk_buff *skb);
+	int	(*prepare_output)(struct xfrm_state *x, struct sk_buff *skb);
+};
+
+int xfrm_register_mode_cbs(u8 mode, const struct xfrm_mode_cbs *mode_cbs);
+void xfrm_unregister_mode_cbs(u8 mode);
+
 static inline int xfrm_af2proto(unsigned int family)
 {
 	switch(family) {
diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index 7e1b3820f91f..d1089b88efc7 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -681,6 +681,7 @@ enum ethtool_link_ext_substate_module {
  * @ETH_SS_STATS_ETH_MAC: names of IEEE 802.3 MAC statistics
  * @ETH_SS_STATS_ETH_CTRL: names of IEEE 802.3 MAC Control statistics
  * @ETH_SS_STATS_RMON: names of RMON statistics
+ * @ETH_SS_STATS_PHY: names of PHY(dev) statistics
  *
  * @ETH_SS_COUNT: number of defined string sets
  */
@@ -706,6 +707,7 @@ enum ethtool_stringset {
 	ETH_SS_STATS_ETH_MAC,
 	ETH_SS_STATS_ETH_CTRL,
 	ETH_SS_STATS_RMON,
+	ETH_SS_STATS_PHY,
 
 	/* add new constants above here */
 	ETH_SS_COUNT
diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h
index 9c909ce733a5..9ff72cfb2e98 100644
--- a/include/uapi/linux/ethtool_netlink.h
+++ b/include/uapi/linux/ethtool_netlink.h
@@ -99,6 +99,7 @@ enum {
 	ETHTOOL_STATS_ETH_MAC,
 	ETHTOOL_STATS_ETH_CTRL,
 	ETHTOOL_STATS_RMON,
+	ETHTOOL_STATS_PHY,
 
 	/* add new constants above here */
 	__ETHTOOL_STATS_CNT
@@ -193,6 +194,19 @@ enum {
 	ETHTOOL_A_STATS_RMON_MAX = (__ETHTOOL_A_STATS_RMON_CNT - 1)
 };
 
+enum {
+	/* Basic packet counters if PHY has separate counters from the MAC */
+	ETHTOOL_A_STATS_PHY_RX_PKTS,
+	ETHTOOL_A_STATS_PHY_RX_BYTES,
+	ETHTOOL_A_STATS_PHY_RX_ERRORS,
+	ETHTOOL_A_STATS_PHY_TX_PKTS,
+	ETHTOOL_A_STATS_PHY_TX_BYTES,
+	ETHTOOL_A_STATS_PHY_TX_ERRORS,
+
+	/* add new constants above here */
+	__ETHTOOL_A_STATS_PHY_CNT,
+	ETHTOOL_A_STATS_PHY_MAX = (__ETHTOOL_A_STATS_PHY_CNT - 1)
+};
 
 /* generic netlink info */
 #define ETHTOOL_GENL_NAME "ethtool"
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 77730c340c8f..bfe880fbbb24 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -1315,6 +1315,8 @@ enum {
 	IFLA_NETKIT_MODE,
 	IFLA_NETKIT_SCRUB,
 	IFLA_NETKIT_PEER_SCRUB,
+	IFLA_NETKIT_HEADROOM,
+	IFLA_NETKIT_TAILROOM,
 	__IFLA_NETKIT_MAX,
 };
 #define IFLA_NETKIT_MAX	(__IFLA_NETKIT_MAX - 1)
diff --git a/include/uapi/linux/in.h b/include/uapi/linux/in.h
index 5d32d53508d9..ced0fc3c3aa5 100644
--- a/include/uapi/linux/in.h
+++ b/include/uapi/linux/in.h
@@ -79,6 +79,8 @@ enum {
 #define IPPROTO_MPLS		IPPROTO_MPLS
   IPPROTO_ETHERNET = 143,	/* Ethernet-within-IPv6 Encapsulation	*/
 #define IPPROTO_ETHERNET	IPPROTO_ETHERNET
+  IPPROTO_AGGFRAG = 144,	/* AGGFRAG in ESP (RFC 9347)		*/
+#define IPPROTO_AGGFRAG		IPPROTO_AGGFRAG
   IPPROTO_RAW = 255,		/* Raw IP packets			*/
 #define IPPROTO_RAW		IPPROTO_RAW
   IPPROTO_SMC = 256,		/* Shared Memory Communications		*/
diff --git a/include/uapi/linux/ip.h b/include/uapi/linux/ip.h
index 283dec7e3645..5bd7ce934d74 100644
--- a/include/uapi/linux/ip.h
+++ b/include/uapi/linux/ip.h
@@ -137,6 +137,22 @@ struct ip_beet_phdr {
 	__u8 reserved;
 };
 
+struct ip_iptfs_hdr {
+	__u8 subtype;		/* 0*: basic, 1: CC */
+	__u8 flags;
+	__be16 block_offset;
+};
+
+struct ip_iptfs_cc_hdr {
+	__u8 subtype;		/* 0: basic, 1*: CC */
+	__u8 flags;
+	__be16 block_offset;
+	__be32 loss_rate;
+	__be64 rtt_adelay_xdelay;
+	__be32 tval;
+	__be32 techo;
+};
+
 /* index values for the variables in ipv4_devconf */
 enum
 {
diff --git a/include/uapi/linux/ipsec.h b/include/uapi/linux/ipsec.h
index 50d8ee1791e2..696b790f4346 100644
--- a/include/uapi/linux/ipsec.h
+++ b/include/uapi/linux/ipsec.h
@@ -14,7 +14,8 @@ enum {
 	IPSEC_MODE_ANY		= 0,	/* We do not support this for SA */
 	IPSEC_MODE_TRANSPORT	= 1,
 	IPSEC_MODE_TUNNEL	= 2,
-	IPSEC_MODE_BEET         = 3
+	IPSEC_MODE_BEET         = 3,
+	IPSEC_MODE_IPTFS        = 4
 };
 
 enum {
diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index 5ee94c511a28..66c3903d29cf 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -100,7 +100,11 @@ enum {
 	RTM_GETMULTICAST,
 #define RTM_GETMULTICAST RTM_GETMULTICAST
 
-	RTM_GETANYCAST	= 62,
+	RTM_NEWANYCAST	= 60,
+#define RTM_NEWANYCAST RTM_NEWANYCAST
+	RTM_DELANYCAST,
+#define RTM_DELANYCAST RTM_DELANYCAST
+	RTM_GETANYCAST,
 #define RTM_GETANYCAST	RTM_GETANYCAST
 
 	RTM_NEWNEIGHTBL	= 64,
@@ -783,6 +787,8 @@ enum rtnetlink_groups {
 #define RTNLGRP_IPV4_MCADDR	RTNLGRP_IPV4_MCADDR
 	RTNLGRP_IPV6_MCADDR,
 #define RTNLGRP_IPV6_MCADDR	RTNLGRP_IPV6_MCADDR
+	RTNLGRP_IPV6_ACADDR,
+#define RTNLGRP_IPV6_ACADDR	RTNLGRP_IPV6_ACADDR
 	__RTNLGRP_MAX
 };
 #define RTNLGRP_MAX	(__RTNLGRP_MAX - 1)
diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h
index 51da2e00112d..2e75674e7d4f 100644
--- a/include/uapi/linux/snmp.h
+++ b/include/uapi/linux/snmp.h
@@ -339,6 +339,8 @@ enum
 	LINUX_MIB_XFRMACQUIREERROR,		/* XfrmAcquireError */
 	LINUX_MIB_XFRMOUTSTATEDIRERROR,		/* XfrmOutStateDirError */
 	LINUX_MIB_XFRMINSTATEDIRERROR,		/* XfrmInStateDirError */
+	LINUX_MIB_XFRMINIPTFSERROR,		/* XfrmInIptfsError */
+	LINUX_MIB_XFRMOUTNOQSPACE,		/* XfrmOutNoQueueSpace */
 	__LINUX_MIB_XFRMMAX
 };
 
diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h
index d73a97e3030a..a23495c0e0a1 100644
--- a/include/uapi/linux/xfrm.h
+++ b/include/uapi/linux/xfrm.h
@@ -158,7 +158,8 @@ enum {
 #define XFRM_MODE_ROUTEOPTIMIZATION 2
 #define XFRM_MODE_IN_TRIGGER 3
 #define XFRM_MODE_BEET 4
-#define XFRM_MODE_MAX 5
+#define XFRM_MODE_IPTFS 5
+#define XFRM_MODE_MAX 6
 
 /* Netlink configuration messages.  */
 enum {
@@ -323,6 +324,12 @@ enum xfrm_attr_type_t {
 	XFRMA_SA_DIR,		/* __u8 */
 	XFRMA_NAT_KEEPALIVE_INTERVAL,	/* __u32 in seconds for NAT keepalive */
 	XFRMA_SA_PCPU,		/* __u32 */
+	XFRMA_IPTFS_DROP_TIME,	/* __u32 in: usec to wait for next seq */
+	XFRMA_IPTFS_REORDER_WINDOW, /* __u16 in: reorder window size (pkts) */
+	XFRMA_IPTFS_DONT_FRAG,	/* out: don't use fragmentation */
+	XFRMA_IPTFS_INIT_DELAY,	/* __u32 out: initial packet wait delay (usec) */
+	XFRMA_IPTFS_MAX_QSIZE,	/* __u32 out: max ingress queue size (octets) */
+	XFRMA_IPTFS_PKT_SIZE,	/* __u32 out: size of outer packet, 0 for PMTU */
 	__XFRMA_MAX
 
 #define XFRMA_OUTPUT_MARK XFRMA_SET_MARK	/* Compatibility */
diff --git a/include/ufs/ufshcd.h b/include/ufs/ufshcd.h
index d650ae6b58d3..74e5b9960c54 100644
--- a/include/ufs/ufshcd.h
+++ b/include/ufs/ufshcd.h
@@ -329,7 +329,6 @@ struct ufs_pwr_mode_info {
  * @program_key: program or evict an inline encryption key
  * @fill_crypto_prdt: initialize crypto-related fields in the PRDT
  * @event_notify: called to notify important events
- * @reinit_notify: called to notify reinit of UFSHCD during max gear switch
  * @mcq_config_resource: called to configure MCQ platform resources
  * @get_hba_mac: reports maximum number of outstanding commands supported by
  *	the controller. Should be implemented for UFSHCI 4.0 or later
@@ -381,7 +380,6 @@ struct ufs_hba_variant_ops {
 				    void *prdt, unsigned int num_segments);
 	void	(*event_notify)(struct ufs_hba *hba,
 				enum ufs_event_type evt, void *data);
-	void	(*reinit_notify)(struct ufs_hba *);
 	int	(*mcq_config_resource)(struct ufs_hba *hba);
 	int	(*get_hba_mac)(struct ufs_hba *hba);
 	int	(*op_runtime_config)(struct ufs_hba *hba);
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index d3403c8216db..ff691f37462c 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -320,7 +320,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	ret |= io_alloc_cache_init(&ctx->rw_cache, IO_ALLOC_CACHE_MAX,
 			    sizeof(struct io_async_rw));
 	ret |= io_alloc_cache_init(&ctx->uring_cache, IO_ALLOC_CACHE_MAX,
-			    sizeof(struct uring_cache));
+			    sizeof(struct io_uring_cmd_data));
 	spin_lock_init(&ctx->msg_lock);
 	ret |= io_alloc_cache_init(&ctx->msg_cache, IO_ALLOC_CACHE_MAX,
 			    sizeof(struct io_kiocb));
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index d407576ddfb7..eec5eb7de843 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -139,6 +139,7 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
 	struct io_uring_buf_ring *br = bl->buf_ring;
 	__u16 tail, head = bl->head;
 	struct io_uring_buf *buf;
+	void __user *ret;
 
 	tail = smp_load_acquire(&br->tail);
 	if (unlikely(tail == head))
@@ -153,6 +154,7 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
 	req->flags |= REQ_F_BUFFER_RING | REQ_F_BUFFERS_COMMIT;
 	req->buf_list = bl;
 	req->buf_index = buf->bid;
+	ret = u64_to_user_ptr(buf->addr);
 
 	if (issue_flags & IO_URING_F_UNLOCKED || !io_file_can_poll(req)) {
 		/*
@@ -168,7 +170,7 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
 		io_kbuf_commit(req, bl, *len, 1);
 		req->buf_list = NULL;
 	}
-	return u64_to_user_ptr(buf->addr);
+	return ret;
 }
 
 void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
diff --git a/io_uring/net.c b/io_uring/net.c
index df1f7dc6f1c8..c6cd38cc5dc4 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -754,6 +754,7 @@ static int io_recvmsg_prep_setup(struct io_kiocb *req)
 	if (req->opcode == IORING_OP_RECV) {
 		kmsg->msg.msg_name = NULL;
 		kmsg->msg.msg_namelen = 0;
+		kmsg->msg.msg_inq = 0;
 		kmsg->msg.msg_control = NULL;
 		kmsg->msg.msg_get_inq = 1;
 		kmsg->msg.msg_controllen = 0;
diff --git a/io_uring/opdef.c b/io_uring/opdef.c
index 3de75eca1c92..e8baef4e5146 100644
--- a/io_uring/opdef.c
+++ b/io_uring/opdef.c
@@ -7,6 +7,7 @@
 #include <linux/fs.h>
 #include <linux/file.h>
 #include <linux/io_uring.h>
+#include <linux/io_uring/cmd.h>
 
 #include "io_uring.h"
 #include "opdef.h"
@@ -414,7 +415,7 @@ const struct io_issue_def io_issue_defs[] = {
 		.plug			= 1,
 		.iopoll			= 1,
 		.iopoll_queue		= 1,
-		.async_size		= 2 * sizeof(struct io_uring_sqe),
+		.async_size		= sizeof(struct io_uring_cmd_data),
 		.prep			= io_uring_cmd_prep,
 		.issue			= io_uring_cmd,
 	},
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 0bcb83e4ce3c..29bb3010f9c0 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -983,6 +983,8 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags)
 		io_kbuf_recycle(req, issue_flags);
 		if (ret < 0)
 			req_set_fail(req);
+	} else if (!(req->flags & REQ_F_APOLL_MULTISHOT)) {
+		cflags = io_put_kbuf(req, ret, issue_flags);
 	} else {
 		/*
 		 * Any successful return value will keep the multishot read
diff --git a/io_uring/timeout.c b/io_uring/timeout.c
index bbe58638eca7..362689b17ccc 100644
--- a/io_uring/timeout.c
+++ b/io_uring/timeout.c
@@ -85,7 +85,27 @@ static void io_timeout_complete(struct io_kiocb *req, struct io_tw_state *ts)
 	io_req_task_complete(req, ts);
 }
 
-static bool io_kill_timeout(struct io_kiocb *req, int status)
+static __cold bool io_flush_killed_timeouts(struct list_head *list, int err)
+{
+	if (list_empty(list))
+		return false;
+
+	while (!list_empty(list)) {
+		struct io_timeout *timeout;
+		struct io_kiocb *req;
+
+		timeout = list_first_entry(list, struct io_timeout, list);
+		list_del_init(&timeout->list);
+		req = cmd_to_io_kiocb(timeout);
+		if (err)
+			req_set_fail(req);
+		io_req_queue_tw_complete(req, err);
+	}
+
+	return true;
+}
+
+static void io_kill_timeout(struct io_kiocb *req, struct list_head *list)
 	__must_hold(&req->ctx->timeout_lock)
 {
 	struct io_timeout_data *io = req->async_data;
@@ -93,21 +113,17 @@ static bool io_kill_timeout(struct io_kiocb *req, int status)
 	if (hrtimer_try_to_cancel(&io->timer) != -1) {
 		struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout);
 
-		if (status)
-			req_set_fail(req);
 		atomic_set(&req->ctx->cq_timeouts,
 			atomic_read(&req->ctx->cq_timeouts) + 1);
-		list_del_init(&timeout->list);
-		io_req_queue_tw_complete(req, status);
-		return true;
+		list_move_tail(&timeout->list, list);
 	}
-	return false;
 }
 
 __cold void io_flush_timeouts(struct io_ring_ctx *ctx)
 {
-	u32 seq;
 	struct io_timeout *timeout, *tmp;
+	LIST_HEAD(list);
+	u32 seq;
 
 	raw_spin_lock_irq(&ctx->timeout_lock);
 	seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
@@ -131,10 +147,11 @@ __cold void io_flush_timeouts(struct io_ring_ctx *ctx)
 		if (events_got < events_needed)
 			break;
 
-		io_kill_timeout(req, 0);
+		io_kill_timeout(req, &list);
 	}
 	ctx->cq_last_tm_flush = seq;
 	raw_spin_unlock_irq(&ctx->timeout_lock);
+	io_flush_killed_timeouts(&list, 0);
 }
 
 static void io_req_tw_fail_links(struct io_kiocb *link, struct io_tw_state *ts)
@@ -661,7 +678,7 @@ __cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct io_uring_task *tctx
 			     bool cancel_all)
 {
 	struct io_timeout *timeout, *tmp;
-	int canceled = 0;
+	LIST_HEAD(list);
 
 	/*
 	 * completion_lock is needed for io_match_task(). Take it before
@@ -672,11 +689,11 @@ __cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct io_uring_task *tctx
 	list_for_each_entry_safe(timeout, tmp, &ctx->timeout_list, list) {
 		struct io_kiocb *req = cmd_to_io_kiocb(timeout);
 
-		if (io_match_task(req, tctx, cancel_all) &&
-		    io_kill_timeout(req, -ECANCELED))
-			canceled++;
+		if (io_match_task(req, tctx, cancel_all))
+			io_kill_timeout(req, &list);
 	}
 	raw_spin_unlock_irq(&ctx->timeout_lock);
 	spin_unlock(&ctx->completion_lock);
-	return canceled != 0;
+
+	return io_flush_killed_timeouts(&list, -ECANCELED);
 }
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index af842e9b4eb9..ce7726a04883 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -16,26 +16,35 @@
 #include "rsrc.h"
 #include "uring_cmd.h"
 
-static struct uring_cache *io_uring_async_get(struct io_kiocb *req)
+static struct io_uring_cmd_data *io_uring_async_get(struct io_kiocb *req)
 {
 	struct io_ring_ctx *ctx = req->ctx;
-	struct uring_cache *cache;
+	struct io_uring_cmd_data *cache;
 
 	cache = io_alloc_cache_get(&ctx->uring_cache);
 	if (cache) {
+		cache->op_data = NULL;
 		req->flags |= REQ_F_ASYNC_DATA;
 		req->async_data = cache;
 		return cache;
 	}
-	if (!io_alloc_async_data(req))
-		return req->async_data;
+	if (!io_alloc_async_data(req)) {
+		cache = req->async_data;
+		cache->op_data = NULL;
+		return cache;
+	}
 	return NULL;
 }
 
 static void io_req_uring_cleanup(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
-	struct uring_cache *cache = req->async_data;
+	struct io_uring_cmd_data *cache = req->async_data;
+
+	if (cache->op_data) {
+		kfree(cache->op_data);
+		cache->op_data = NULL;
+	}
 
 	if (issue_flags & IO_URING_F_UNLOCKED)
 		return;
@@ -183,7 +192,7 @@ static int io_uring_cmd_prep_setup(struct io_kiocb *req,
 				   const struct io_uring_sqe *sqe)
 {
 	struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
-	struct uring_cache *cache;
+	struct io_uring_cmd_data *cache;
 
 	cache = io_uring_async_get(req);
 	if (unlikely(!cache))
@@ -260,7 +269,7 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags)
 
 	ret = file->f_op->uring_cmd(ioucmd, issue_flags);
 	if (ret == -EAGAIN) {
-		struct uring_cache *cache = req->async_data;
+		struct io_uring_cmd_data *cache = req->async_data;
 
 		if (ioucmd->sqe != (void *) cache)
 			memcpy(cache, ioucmd->sqe, uring_sqe_size(req->ctx));
diff --git a/io_uring/uring_cmd.h b/io_uring/uring_cmd.h
index 7dba0f1efc58..f6837ee0955b 100644
--- a/io_uring/uring_cmd.h
+++ b/io_uring/uring_cmd.h
@@ -1,9 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
 
-struct uring_cache {
-	struct io_uring_sqe sqes[2];
-};
-
 int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags);
 int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
 
diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh
index 383fd43ac612..7e1340da5aca 100755
--- a/kernel/gen_kheaders.sh
+++ b/kernel/gen_kheaders.sh
@@ -89,6 +89,7 @@ find $cpio_dir -type f -print0 |
 
 # Create archive and try to normalize metadata for reproducibility.
 tar "${KBUILD_BUILD_TIMESTAMP:+--mtime=$KBUILD_BUILD_TIMESTAMP}" \
+    --exclude=".__afs*" --exclude=".nfs*" \
     --owner=0 --group=0 --sort=name --numeric-owner --mode=u=rw,go=r,a+X \
     -I $XZ -cf $tarfile -C $cpio_dir/ . > /dev/null
 
diff --git a/kernel/kcov.c b/kernel/kcov.c
index 28a6be6e64fd..187ba1b80bda 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -166,7 +166,7 @@ static void kcov_remote_area_put(struct kcov_remote_area *area,
  * Unlike in_serving_softirq(), this function returns false when called during
  * a hardirq or an NMI that happened in the softirq context.
  */
-static inline bool in_softirq_really(void)
+static __always_inline bool in_softirq_really(void)
 {
 	return in_serving_softirq() && !in_hardirq() && !in_nmi();
 }
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 7fff1d045477..19d2699cf638 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -4763,7 +4763,7 @@ static void scx_ops_bypass(bool bypass)
 		 * sees scx_rq_bypassing() before moving tasks to SCX.
 		 */
 		if (!scx_enabled()) {
-			rq_unlock_irqrestore(rq, &rf);
+			rq_unlock(rq, &rf);
 			continue;
 		}
 
@@ -7013,7 +7013,7 @@ __bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id,
 		return -ENOENT;
 
 	INIT_LIST_HEAD(&kit->cursor.node);
-	kit->cursor.flags |= SCX_DSQ_LNODE_ITER_CURSOR | flags;
+	kit->cursor.flags = SCX_DSQ_LNODE_ITER_CURSOR | flags;
 	kit->cursor.priv = READ_ONCE(kit->dsq->seq);
 
 	return 0;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 8b07576814a5..f7d8fc204579 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -3680,23 +3680,27 @@ void workqueue_softirq_dead(unsigned int cpu)
  * check_flush_dependency - check for flush dependency sanity
  * @target_wq: workqueue being flushed
  * @target_work: work item being flushed (NULL for workqueue flushes)
+ * @from_cancel: are we called from the work cancel path
  *
  * %current is trying to flush the whole @target_wq or @target_work on it.
- * If @target_wq doesn't have %WQ_MEM_RECLAIM, verify that %current is not
- * reclaiming memory or running on a workqueue which doesn't have
- * %WQ_MEM_RECLAIM as that can break forward-progress guarantee leading to
- * a deadlock.
+ * If this is not the cancel path (which implies work being flushed is either
+ * already running, or will not be at all), check if @target_wq doesn't have
+ * %WQ_MEM_RECLAIM and verify that %current is not reclaiming memory or running
+ * on a workqueue which doesn't have %WQ_MEM_RECLAIM as that can break forward-
+ * progress guarantee leading to a deadlock.
  */
 static void check_flush_dependency(struct workqueue_struct *target_wq,
-				   struct work_struct *target_work)
+				   struct work_struct *target_work,
+				   bool from_cancel)
 {
-	work_func_t target_func = target_work ? target_work->func : NULL;
+	work_func_t target_func;
 	struct worker *worker;
 
-	if (target_wq->flags & WQ_MEM_RECLAIM)
+	if (from_cancel || target_wq->flags & WQ_MEM_RECLAIM)
 		return;
 
 	worker = current_wq_worker();
+	target_func = target_work ? target_work->func : NULL;
 
 	WARN_ONCE(current->flags & PF_MEMALLOC,
 		  "workqueue: PF_MEMALLOC task %d(%s) is flushing !WQ_MEM_RECLAIM %s:%ps",
@@ -3980,7 +3984,7 @@ void __flush_workqueue(struct workqueue_struct *wq)
 		list_add_tail(&this_flusher.list, &wq->flusher_overflow);
 	}
 
-	check_flush_dependency(wq, NULL);
+	check_flush_dependency(wq, NULL, false);
 
 	mutex_unlock(&wq->mutex);
 
@@ -4155,7 +4159,7 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
 	}
 
 	wq = pwq->wq;
-	check_flush_dependency(wq, work);
+	check_flush_dependency(wq, work, from_cancel);
 
 	insert_wq_barrier(pwq, barr, work, worker);
 	raw_spin_unlock_irq(&pool->lock);
@@ -5641,6 +5645,7 @@ static void wq_adjust_max_active(struct workqueue_struct *wq)
 	} while (activated);
 }
 
+__printf(1, 0)
 static struct workqueue_struct *__alloc_workqueue(const char *fmt,
 						  unsigned int flags,
 						  int max_active, va_list args)
diff --git a/lib/maple_tree.c b/lib/maple_tree.c
index d0ae808f3a14..047397136f15 100644
--- a/lib/maple_tree.c
+++ b/lib/maple_tree.c
@@ -4354,6 +4354,7 @@ int mas_alloc_cyclic(struct ma_state *mas, unsigned long *startp,
 		ret = 1;
 	}
 	if (ret < 0 && range_lo > min) {
+		mas_reset(mas);
 		ret = mas_empty_area(mas, min, range_hi, 1);
 		if (ret == 0)
 			ret = 1;
diff --git a/mm/damon/core.c b/mm/damon/core.c
index 8b8e2933dcd4..0776452a1abb 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -868,6 +868,11 @@ static int damon_commit_schemes(struct damon_ctx *dst, struct damon_ctx *src)
 				NUMA_NO_NODE);
 		if (!new_scheme)
 			return -ENOMEM;
+		err = damos_commit(new_scheme, src_scheme);
+		if (err) {
+			damon_destroy_scheme(new_scheme);
+			return err;
+		}
 		damon_add_scheme(dst, new_scheme);
 	}
 	return 0;
@@ -961,8 +966,11 @@ static int damon_commit_targets(
 			return -ENOMEM;
 		err = damon_commit_target(new_target, false,
 				src_target, damon_target_has_pid(src));
-		if (err)
+		if (err) {
+			damon_destroy_target(new_target);
 			return err;
+		}
+		damon_add_target(dst, new_target);
 	}
 	return 0;
 }
diff --git a/mm/filemap.c b/mm/filemap.c
index f61cf51c2238..33b60d448fca 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -124,15 +124,6 @@
  *    ->private_lock		(zap_pte_range->block_dirty_folio)
  */
 
-static void mapping_set_update(struct xa_state *xas,
-		struct address_space *mapping)
-{
-	if (dax_mapping(mapping) || shmem_mapping(mapping))
-		return;
-	xas_set_update(xas, workingset_update_node);
-	xas_set_lru(xas, &shadow_nodes);
-}
-
 static void page_cache_delete(struct address_space *mapping,
 				   struct folio *folio, void *shadow)
 {
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index cec4b121193f..c498874a7170 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -7211,7 +7211,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
 			spte = hugetlb_walk(svma, saddr,
 					    vma_mmu_pagesize(svma));
 			if (spte) {
-				get_page(virt_to_page(spte));
+				ptdesc_pmd_pts_inc(virt_to_ptdesc(spte));
 				break;
 			}
 		}
@@ -7226,7 +7226,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
 				(pmd_t *)((unsigned long)spte & PAGE_MASK));
 		mm_inc_nr_pmds(mm);
 	} else {
-		put_page(virt_to_page(spte));
+		ptdesc_pmd_pts_dec(virt_to_ptdesc(spte));
 	}
 	spin_unlock(&mm->page_table_lock);
 out:
@@ -7238,10 +7238,6 @@ out:
 /*
  * unmap huge page backed by shared pte.
  *
- * Hugetlb pte page is ref counted at the time of mapping.  If pte is shared
- * indicated by page_count > 1, unmap is achieved by clearing pud and
- * decrementing the ref count. If count == 1, the pte page is not shared.
- *
  * Called with page table lock held.
  *
  * returns: 1 successfully unmapped a shared pte page
@@ -7250,18 +7246,20 @@ out:
 int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
 					unsigned long addr, pte_t *ptep)
 {
+	unsigned long sz = huge_page_size(hstate_vma(vma));
 	pgd_t *pgd = pgd_offset(mm, addr);
 	p4d_t *p4d = p4d_offset(pgd, addr);
 	pud_t *pud = pud_offset(p4d, addr);
 
 	i_mmap_assert_write_locked(vma->vm_file->f_mapping);
 	hugetlb_vma_assert_locked(vma);
-	BUG_ON(page_count(virt_to_page(ptep)) == 0);
-	if (page_count(virt_to_page(ptep)) == 1)
+	if (sz != PMD_SIZE)
+		return 0;
+	if (!ptdesc_pmd_pts_count(virt_to_ptdesc(ptep)))
 		return 0;
 
 	pud_clear(pud);
-	put_page(virt_to_page(ptep));
+	ptdesc_pmd_pts_dec(virt_to_ptdesc(ptep));
 	mm_dec_nr_pmds(mm);
 	return 1;
 }
diff --git a/mm/internal.h b/mm/internal.h
index 3bd08bafad04..9826f7dce607 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1504,6 +1504,12 @@ static inline void shrinker_debugfs_remove(struct dentry *debugfs_entry,
 /* Only track the nodes of mappings with shadow entries */
 void workingset_update_node(struct xa_node *node);
 extern struct list_lru shadow_nodes;
+#define mapping_set_update(xas, mapping) do {			\
+	if (!dax_mapping(mapping) && !shmem_mapping(mapping)) {	\
+		xas_set_update(xas, workingset_update_node);	\
+		xas_set_lru(xas, &shadow_nodes);		\
+	}							\
+} while (0)
 
 /* mremap.c */
 unsigned long move_page_tables(struct vm_area_struct *vma,
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 6f8d46d107b4..653dbb1ff05c 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -19,6 +19,7 @@
 #include <linux/rcupdate_wait.h>
 #include <linux/swapops.h>
 #include <linux/shmem_fs.h>
+#include <linux/dax.h>
 #include <linux/ksm.h>
 
 #include <asm/tlb.h>
@@ -1837,6 +1838,8 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
 	if (result != SCAN_SUCCEED)
 		goto out;
 
+	mapping_set_update(&xas, mapping);
+
 	__folio_set_locked(new_folio);
 	if (is_shmem)
 		__folio_set_swapbacked(new_folio);
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 2a945c07ae99..737af23f4f4e 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -373,7 +373,7 @@ static void print_unreferenced(struct seq_file *seq,
 
 	for (i = 0; i < nr_entries; i++) {
 		void *ptr = (void *)entries[i];
-		warn_or_seq_printf(seq, "    [<%pK>] %pS\n", ptr, ptr);
+		warn_or_seq_printf(seq, "    %pS\n", ptr);
 	}
 }
 
diff --git a/mm/list_lru.c b/mm/list_lru.c
index f93ada6a207b..7d69434c70e0 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -77,7 +77,6 @@ again:
 			spin_lock(&l->lock);
 		nr_items = READ_ONCE(l->nr_items);
 		if (likely(nr_items != LONG_MIN)) {
-			WARN_ON(nr_items < 0);
 			rcu_read_unlock();
 			return l;
 		}
@@ -450,6 +449,7 @@ static void memcg_reparent_list_lru_one(struct list_lru *lru, int nid,
 
 	list_splice_init(&src->list, &dst->list);
 	if (src->nr_items) {
+		WARN_ON(src->nr_items < 0);
 		dst->nr_items += src->nr_items;
 		set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru));
 	}
diff --git a/mm/memfd.c b/mm/memfd.c
index c17c3ea701a1..35a370d75c9a 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -170,7 +170,7 @@ static int memfd_wait_for_pins(struct address_space *mapping)
 	return error;
 }
 
-static unsigned int *memfd_file_seals_ptr(struct file *file)
+unsigned int *memfd_file_seals_ptr(struct file *file)
 {
 	if (shmem_file(file))
 		return &SHMEM_I(file_inode(file))->seals;
diff --git a/mm/mmap.c b/mm/mmap.c
index d32b7e701058..aec208f90337 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -47,6 +47,7 @@
 #include <linux/oom.h>
 #include <linux/sched/mm.h>
 #include <linux/ksm.h>
+#include <linux/memfd.h>
 
 #include <linux/uaccess.h>
 #include <asm/cacheflush.h>
@@ -368,6 +369,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
 
 	if (file) {
 		struct inode *inode = file_inode(file);
+		unsigned int seals = memfd_file_seals(file);
 		unsigned long flags_mask;
 
 		if (!file_mmap_ok(file, inode, pgoff, len))
@@ -408,6 +410,8 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
 			vm_flags |= VM_SHARED | VM_MAYSHARE;
 			if (!(file->f_mode & FMODE_WRITE))
 				vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
+			else if (is_readonly_sealed(seals, vm_flags))
+				vm_flags &= ~VM_MAYWRITE;
 			fallthrough;
 		case MAP_PRIVATE:
 			if (!(file->f_mode & FMODE_READ))
@@ -888,7 +892,7 @@ __get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
 
 	if (get_area) {
 		addr = get_area(file, addr, len, pgoff, flags);
-	} else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)
+	} else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && !file
 		   && !addr /* no hint */
 		   && IS_ALIGNED(len, PMD_SIZE)) {
 		/* Ensures that larger anonymous mappings are THP aligned. */
diff --git a/mm/readahead.c b/mm/readahead.c
index ea650b8b02fb..e151f4b13ca4 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -646,7 +646,11 @@ void page_cache_async_ra(struct readahead_control *ractl,
 			1UL << order);
 	if (index == expected) {
 		ra->start += ra->size;
-		ra->size = get_next_ra_size(ra, max_pages);
+		/*
+		 * In the case of MADV_HUGEPAGE, the actual size might exceed
+		 * the readahead window.
+		 */
+		ra->size = max(ra->size, get_next_ra_size(ra, max_pages));
 		ra->async_size = ra->size;
 		goto readit;
 	}
diff --git a/mm/shmem.c b/mm/shmem.c
index f6fb053ac50d..ac58d4fb2e6f 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1535,7 +1535,7 @@ try_split:
 			    !shmem_falloc->waitq &&
 			    index >= shmem_falloc->start &&
 			    index < shmem_falloc->next)
-				shmem_falloc->nr_unswapped++;
+				shmem_falloc->nr_unswapped += nr_pages;
 			else
 				shmem_falloc = NULL;
 			spin_unlock(&inode->i_lock);
@@ -1689,6 +1689,7 @@ unsigned long shmem_allowable_huge_orders(struct inode *inode,
 	unsigned long mask = READ_ONCE(huge_shmem_orders_always);
 	unsigned long within_size_orders = READ_ONCE(huge_shmem_orders_within_size);
 	unsigned long vm_flags = vma ? vma->vm_flags : 0;
+	pgoff_t aligned_index;
 	bool global_huge;
 	loff_t i_size;
 	int order;
@@ -1723,9 +1724,9 @@ unsigned long shmem_allowable_huge_orders(struct inode *inode,
 	/* Allow mTHP that will be fully within i_size. */
 	order = highest_order(within_size_orders);
 	while (within_size_orders) {
-		index = round_up(index + 1, order);
+		aligned_index = round_up(index + 1, 1 << order);
 		i_size = round_up(i_size_read(inode), PAGE_SIZE);
-		if (i_size >> PAGE_SHIFT >= index) {
+		if (i_size >> PAGE_SHIFT >= aligned_index) {
 			mask |= within_size_orders;
 			break;
 		}
diff --git a/mm/util.c b/mm/util.c
index c1c3b06ab4f9..60aa40f612b8 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -297,12 +297,7 @@ void *memdup_user_nul(const void __user *src, size_t len)
 {
 	char *p;
 
-	/*
-	 * Always use GFP_KERNEL, since copy_from_user() can sleep and
-	 * cause pagefault, which makes it pointless to use GFP_NOFS
-	 * or GFP_ATOMIC.
-	 */
-	p = kmalloc_track_caller(len + 1, GFP_KERNEL);
+	p = kmem_buckets_alloc_track_caller(user_buckets, len + 1, GFP_USER | __GFP_NOWARN);
 	if (!p)
 		return ERR_PTR(-ENOMEM);
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 76378bc257e3..9a859b7d18d7 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -374,7 +374,14 @@ unsigned long zone_reclaimable_pages(struct zone *zone)
 	if (can_reclaim_anon_pages(NULL, zone_to_nid(zone), NULL))
 		nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) +
 			zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON);
-
+	/*
+	 * If there are no reclaimable file-backed or anonymous pages,
+	 * ensure zones with sufficient free pages are not skipped.
+	 * This prevents zones like DMA32 from being ignored in reclaim
+	 * scenarios where they can still help alleviate memory pressure.
+	 */
+	if (nr == 0)
+		nr = zone_page_state_snapshot(zone, NR_FREE_PAGES);
 	return nr;
 }
 
diff --git a/mm/zswap.c b/mm/zswap.c
index f6316b66fb23..5a27af8d86ea 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -880,6 +880,18 @@ static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
 	return 0;
 }
 
+/* Prevent CPU hotplug from freeing up the per-CPU acomp_ctx resources */
+static struct crypto_acomp_ctx *acomp_ctx_get_cpu(struct crypto_acomp_ctx __percpu *acomp_ctx)
+{
+	cpus_read_lock();
+	return raw_cpu_ptr(acomp_ctx);
+}
+
+static void acomp_ctx_put_cpu(void)
+{
+	cpus_read_unlock();
+}
+
 static bool zswap_compress(struct page *page, struct zswap_entry *entry,
 			   struct zswap_pool *pool)
 {
@@ -893,8 +905,7 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
 	gfp_t gfp;
 	u8 *dst;
 
-	acomp_ctx = raw_cpu_ptr(pool->acomp_ctx);
-
+	acomp_ctx = acomp_ctx_get_cpu(pool->acomp_ctx);
 	mutex_lock(&acomp_ctx->mutex);
 
 	dst = acomp_ctx->buffer;
@@ -950,6 +961,7 @@ unlock:
 		zswap_reject_alloc_fail++;
 
 	mutex_unlock(&acomp_ctx->mutex);
+	acomp_ctx_put_cpu();
 	return comp_ret == 0 && alloc_ret == 0;
 }
 
@@ -960,7 +972,7 @@ static void zswap_decompress(struct zswap_entry *entry, struct folio *folio)
 	struct crypto_acomp_ctx *acomp_ctx;
 	u8 *src;
 
-	acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
+	acomp_ctx = acomp_ctx_get_cpu(entry->pool->acomp_ctx);
 	mutex_lock(&acomp_ctx->mutex);
 
 	src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO);
@@ -990,6 +1002,7 @@ static void zswap_decompress(struct zswap_entry *entry, struct folio *folio)
 
 	if (src != acomp_ctx->buffer)
 		zpool_unmap_handle(zpool, entry->handle);
+	acomp_ctx_put_cpu();
 }
 
 /*********************************
diff --git a/net/802/psnap.c b/net/802/psnap.c
index fca9d454905f..389df460c8c4 100644
--- a/net/802/psnap.c
+++ b/net/802/psnap.c
@@ -55,11 +55,11 @@ static int snap_rcv(struct sk_buff *skb, struct net_device *dev,
 		goto drop;
 
 	rcu_read_lock();
-	proto = find_snap_client(skb_transport_header(skb));
+	proto = find_snap_client(skb->data);
 	if (proto) {
 		/* Pass the frame on. */
-		skb->transport_header += 5;
 		skb_pull_rcsum(skb, 5);
+		skb_reset_transport_header(skb);
 		rc = proto->rcvfunc(skb, dev, &snap_packet_type, orig_dev);
 	}
 	rcu_read_unlock();
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index d6f9fae06a9d..aa6c714892ec 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -467,7 +467,7 @@ einval_put:
 	goto out_put;
 }
 
-static void ax25_fillin_cb_from_dev(ax25_cb *ax25, ax25_dev *ax25_dev)
+static void ax25_fillin_cb_from_dev(ax25_cb *ax25, const ax25_dev *ax25_dev)
 {
 	ax25->rtt     = msecs_to_jiffies(ax25_dev->values[AX25_VALUES_T1]) / 2;
 	ax25->t1      = msecs_to_jiffies(ax25_dev->values[AX25_VALUES_T1]);
@@ -677,22 +677,22 @@ static int ax25_setsockopt(struct socket *sock, int level, int optname,
 			break;
 		}
 
-		rtnl_lock();
-		dev = __dev_get_by_name(&init_net, devname);
+		rcu_read_lock();
+		dev = dev_get_by_name_rcu(&init_net, devname);
 		if (!dev) {
-			rtnl_unlock();
+			rcu_read_unlock();
 			res = -ENODEV;
 			break;
 		}
 
 		ax25->ax25_dev = ax25_dev_ax25dev(dev);
 		if (!ax25->ax25_dev) {
-			rtnl_unlock();
+			rcu_read_unlock();
 			res = -ENODEV;
 			break;
 		}
 		ax25_fillin_cb(ax25, ax25->ax25_dev);
-		rtnl_unlock();
+		rcu_read_unlock();
 		break;
 
 	default:
diff --git a/net/ax25/ax25_dev.c b/net/ax25/ax25_dev.c
index 9efd6690b344..3733c0254a50 100644
--- a/net/ax25/ax25_dev.c
+++ b/net/ax25/ax25_dev.c
@@ -90,7 +90,7 @@ void ax25_dev_device_up(struct net_device *dev)
 
 	spin_lock_bh(&ax25_dev_lock);
 	list_add(&ax25_dev->list, &ax25_dev_list);
-	dev->ax25_ptr     = ax25_dev;
+	rcu_assign_pointer(dev->ax25_ptr, ax25_dev);
 	spin_unlock_bh(&ax25_dev_lock);
 
 	ax25_register_dev_sysctl(ax25_dev);
@@ -125,7 +125,7 @@ void ax25_dev_device_down(struct net_device *dev)
 		}
 	}
 
-	dev->ax25_ptr = NULL;
+	RCU_INIT_POINTER(dev->ax25_ptr, NULL);
 	spin_unlock_bh(&ax25_dev_lock);
 	netdev_put(dev, &ax25_dev->dev_tracker);
 	ax25_dev_put(ax25_dev);
diff --git a/net/ax25/ax25_ip.c b/net/ax25/ax25_ip.c
index 36249776c021..215d4ccf12b9 100644
--- a/net/ax25/ax25_ip.c
+++ b/net/ax25/ax25_ip.c
@@ -122,6 +122,7 @@ netdev_tx_t ax25_ip_xmit(struct sk_buff *skb)
 	if (dev == NULL)
 		dev = skb->dev;
 
+	rcu_read_lock();
 	if ((ax25_dev = ax25_dev_ax25dev(dev)) == NULL) {
 		kfree_skb(skb);
 		goto put;
@@ -202,7 +203,7 @@ netdev_tx_t ax25_ip_xmit(struct sk_buff *skb)
 	ax25_queue_xmit(skb, dev);
 
 put:
-
+	rcu_read_unlock();
 	ax25_route_lock_unuse();
 	return NETDEV_TX_OK;
 }
diff --git a/net/ax25/ax25_out.c b/net/ax25/ax25_out.c
index 3db76d2470e9..8bca2ace98e5 100644
--- a/net/ax25/ax25_out.c
+++ b/net/ax25/ax25_out.c
@@ -39,10 +39,14 @@ ax25_cb *ax25_send_frame(struct sk_buff *skb, int paclen, const ax25_address *sr
 	 * specified.
 	 */
 	if (paclen == 0) {
-		if ((ax25_dev = ax25_dev_ax25dev(dev)) == NULL)
+		rcu_read_lock();
+		ax25_dev = ax25_dev_ax25dev(dev);
+		if (!ax25_dev) {
+			rcu_read_unlock();
 			return NULL;
-
+		}
 		paclen = ax25_dev->values[AX25_VALUES_PACLEN];
+		rcu_read_unlock();
 	}
 
 	/*
@@ -53,13 +57,19 @@ ax25_cb *ax25_send_frame(struct sk_buff *skb, int paclen, const ax25_address *sr
 		return ax25;		/* It already existed */
 	}
 
-	if ((ax25_dev = ax25_dev_ax25dev(dev)) == NULL)
+	rcu_read_lock();
+	ax25_dev = ax25_dev_ax25dev(dev);
+	if (!ax25_dev) {
+		rcu_read_unlock();
 		return NULL;
+	}
 
-	if ((ax25 = ax25_create_cb()) == NULL)
+	if ((ax25 = ax25_create_cb()) == NULL) {
+		rcu_read_unlock();
 		return NULL;
-
+	}
 	ax25_fillin_cb(ax25, ax25_dev);
+	rcu_read_unlock();
 
 	ax25->source_addr = *src;
 	ax25->dest_addr   = *dest;
@@ -358,7 +368,9 @@ void ax25_queue_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	unsigned char *ptr;
 
+	rcu_read_lock();
 	skb->protocol = ax25_type_trans(skb, ax25_fwd_dev(dev));
+	rcu_read_unlock();
 
 	ptr  = skb_push(skb, 1);
 	*ptr = 0x00;			/* KISS */
diff --git a/net/ax25/ax25_route.c b/net/ax25/ax25_route.c
index b7c4d656a94b..69de75db0c9c 100644
--- a/net/ax25/ax25_route.c
+++ b/net/ax25/ax25_route.c
@@ -406,6 +406,7 @@ int ax25_rt_autobind(ax25_cb *ax25, ax25_address *addr)
 		ax25_route_lock_unuse();
 		return -EHOSTUNREACH;
 	}
+	rcu_read_lock();
 	if ((ax25->ax25_dev = ax25_dev_ax25dev(ax25_rt->dev)) == NULL) {
 		err = -EHOSTUNREACH;
 		goto put;
@@ -442,6 +443,7 @@ int ax25_rt_autobind(ax25_cb *ax25, ax25_address *addr)
 	}
 
 put:
+	rcu_read_unlock();
 	ax25_route_lock_unuse();
 	return err;
 }
diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c
index c86f4e42e69c..7b2b04d6b856 100644
--- a/net/bluetooth/hci_sync.c
+++ b/net/bluetooth/hci_sync.c
@@ -1031,9 +1031,9 @@ static bool adv_use_rpa(struct hci_dev *hdev, uint32_t flags)
 
 static int hci_set_random_addr_sync(struct hci_dev *hdev, bdaddr_t *rpa)
 {
-	/* If we're advertising or initiating an LE connection we can't
-	 * go ahead and change the random address at this time. This is
-	 * because the eventual initiator address used for the
+	/* If a random_addr has been set we're advertising or initiating an LE
+	 * connection we can't go ahead and change the random address at this
+	 * time. This is because the eventual initiator address used for the
 	 * subsequently created connection will be undefined (some
 	 * controllers use the new address and others the one we had
 	 * when the operation started).
@@ -1041,8 +1041,9 @@ static int hci_set_random_addr_sync(struct hci_dev *hdev, bdaddr_t *rpa)
 	 * In this kind of scenario skip the update and let the random
 	 * address be updated at the next cycle.
 	 */
-	if (hci_dev_test_flag(hdev, HCI_LE_ADV) ||
-	    hci_lookup_le_connect(hdev)) {
+	if (bacmp(&hdev->random_addr, BDADDR_ANY) &&
+	    (hci_dev_test_flag(hdev, HCI_LE_ADV) ||
+	    hci_lookup_le_connect(hdev))) {
 		bt_dev_dbg(hdev, "Deferring random address update");
 		hci_dev_set_flag(hdev, HCI_RPA_EXPIRED);
 		return 0;
diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
index b31192d473d0..de47ad999d7b 100644
--- a/net/bluetooth/mgmt.c
+++ b/net/bluetooth/mgmt.c
@@ -7655,6 +7655,24 @@ static void device_added(struct sock *sk, struct hci_dev *hdev,
 	mgmt_event(MGMT_EV_DEVICE_ADDED, hdev, &ev, sizeof(ev), sk);
 }
 
+static void add_device_complete(struct hci_dev *hdev, void *data, int err)
+{
+	struct mgmt_pending_cmd *cmd = data;
+	struct mgmt_cp_add_device *cp = cmd->param;
+
+	if (!err) {
+		device_added(cmd->sk, hdev, &cp->addr.bdaddr, cp->addr.type,
+			     cp->action);
+		device_flags_changed(NULL, hdev, &cp->addr.bdaddr,
+				     cp->addr.type, hdev->conn_flags,
+				     PTR_UINT(cmd->user_data));
+	}
+
+	mgmt_cmd_complete(cmd->sk, hdev->id, MGMT_OP_ADD_DEVICE,
+			  mgmt_status(err), &cp->addr, sizeof(cp->addr));
+	mgmt_pending_free(cmd);
+}
+
 static int add_device_sync(struct hci_dev *hdev, void *data)
 {
 	return hci_update_passive_scan_sync(hdev);
@@ -7663,6 +7681,7 @@ static int add_device_sync(struct hci_dev *hdev, void *data)
 static int add_device(struct sock *sk, struct hci_dev *hdev,
 		      void *data, u16 len)
 {
+	struct mgmt_pending_cmd *cmd;
 	struct mgmt_cp_add_device *cp = data;
 	u8 auto_conn, addr_type;
 	struct hci_conn_params *params;
@@ -7743,9 +7762,24 @@ static int add_device(struct sock *sk, struct hci_dev *hdev,
 			current_flags = params->flags;
 	}
 
-	err = hci_cmd_sync_queue(hdev, add_device_sync, NULL, NULL);
-	if (err < 0)
+	cmd = mgmt_pending_new(sk, MGMT_OP_ADD_DEVICE, hdev, data, len);
+	if (!cmd) {
+		err = -ENOMEM;
 		goto unlock;
+	}
+
+	cmd->user_data = UINT_PTR(current_flags);
+
+	err = hci_cmd_sync_queue(hdev, add_device_sync, cmd,
+				 add_device_complete);
+	if (err < 0) {
+		err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_ADD_DEVICE,
+					MGMT_STATUS_FAILED, &cp->addr,
+					sizeof(cp->addr));
+		mgmt_pending_free(cmd);
+	}
+
+	goto unlock;
 
 added:
 	device_added(sk, hdev, &cp->addr.bdaddr, cp->addr.type, cp->action);
diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c
index af80d599c337..21a5b5535ebc 100644
--- a/net/bluetooth/rfcomm/tty.c
+++ b/net/bluetooth/rfcomm/tty.c
@@ -201,14 +201,14 @@ static ssize_t address_show(struct device *tty_dev,
 			    struct device_attribute *attr, char *buf)
 {
 	struct rfcomm_dev *dev = dev_get_drvdata(tty_dev);
-	return sprintf(buf, "%pMR\n", &dev->dst);
+	return sysfs_emit(buf, "%pMR\n", &dev->dst);
 }
 
 static ssize_t channel_show(struct device *tty_dev,
 			    struct device_attribute *attr, char *buf)
 {
 	struct rfcomm_dev *dev = dev_get_drvdata(tty_dev);
-	return sprintf(buf, "%d\n", dev->channel);
+	return sysfs_emit(buf, "%d\n", dev->channel);
 }
 
 static DEVICE_ATTR_RO(address);
diff --git a/net/bridge/br_arp_nd_proxy.c b/net/bridge/br_arp_nd_proxy.c
index c7869a286df4..115a23054a58 100644
--- a/net/bridge/br_arp_nd_proxy.c
+++ b/net/bridge/br_arp_nd_proxy.c
@@ -229,7 +229,7 @@ void br_do_proxy_suppress_arp(struct sk_buff *skb, struct net_bridge *br,
 #endif
 
 #if IS_ENABLED(CONFIG_IPV6)
-struct nd_msg *br_is_nd_neigh_msg(struct sk_buff *skb, struct nd_msg *msg)
+struct nd_msg *br_is_nd_neigh_msg(const struct sk_buff *skb, struct nd_msg *msg)
 {
 	struct nd_msg *m;
 
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 29d6ec45cf41..1054b8a88edc 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -2299,6 +2299,6 @@ void br_do_proxy_suppress_arp(struct sk_buff *skb, struct net_bridge *br,
 			      u16 vid, struct net_bridge_port *p);
 void br_do_suppress_nd(struct sk_buff *skb, struct net_bridge *br,
 		       u16 vid, struct net_bridge_port *p, struct nd_msg *msg);
-struct nd_msg *br_is_nd_neigh_msg(struct sk_buff *skb, struct nd_msg *m);
+struct nd_msg *br_is_nd_neigh_msg(const struct sk_buff *skb, struct nd_msg *m);
 bool br_is_neigh_suppress_enabled(const struct net_bridge_port *p, u16 vid);
 #endif
diff --git a/net/core/dev.c b/net/core/dev.c
index e7223972b9aa..fda4e1039bf0 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -753,6 +753,36 @@ int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr,
 }
 EXPORT_SYMBOL_GPL(dev_fill_forward_path);
 
+/* must be called under rcu_read_lock(), as we dont take a reference */
+static struct napi_struct *napi_by_id(unsigned int napi_id)
+{
+	unsigned int hash = napi_id % HASH_SIZE(napi_hash);
+	struct napi_struct *napi;
+
+	hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
+		if (napi->napi_id == napi_id)
+			return napi;
+
+	return NULL;
+}
+
+/* must be called under rcu_read_lock(), as we dont take a reference */
+struct napi_struct *netdev_napi_by_id(struct net *net, unsigned int napi_id)
+{
+	struct napi_struct *napi;
+
+	napi = napi_by_id(napi_id);
+	if (!napi)
+		return NULL;
+
+	if (WARN_ON_ONCE(!napi->dev))
+		return NULL;
+	if (!net_eq(net, dev_net(napi->dev)))
+		return NULL;
+
+	return napi;
+}
+
 /**
  *	__dev_get_by_name	- find a device by its name
  *	@net: the applicable net namespace
@@ -927,7 +957,6 @@ EXPORT_SYMBOL(netdev_get_by_index);
  *	its reference counter increased so the caller must be careful
  *	about locking. The caller must hold RCU lock.
  */
-
 struct net_device *dev_get_by_napi_id(unsigned int napi_id)
 {
 	struct napi_struct *napi;
@@ -941,7 +970,6 @@ struct net_device *dev_get_by_napi_id(unsigned int napi_id)
 
 	return napi ? napi->dev : NULL;
 }
-EXPORT_SYMBOL(dev_get_by_napi_id);
 
 static DEFINE_SEQLOCK(netdev_rename_lock);
 
@@ -1769,14 +1797,19 @@ int register_netdevice_notifier(struct notifier_block *nb)
 
 	/* Close race with setup_net() and cleanup_net() */
 	down_write(&pernet_ops_rwsem);
+
+	/* When RTNL is removed, we need protection for netdev_chain. */
 	rtnl_lock();
+
 	err = raw_notifier_chain_register(&netdev_chain, nb);
 	if (err)
 		goto unlock;
 	if (dev_boot_phase)
 		goto unlock;
 	for_each_net(net) {
+		__rtnl_net_lock(net);
 		err = call_netdevice_register_net_notifiers(nb, net);
+		__rtnl_net_unlock(net);
 		if (err)
 			goto rollback;
 	}
@@ -1787,8 +1820,11 @@ unlock:
 	return err;
 
 rollback:
-	for_each_net_continue_reverse(net)
+	for_each_net_continue_reverse(net) {
+		__rtnl_net_lock(net);
 		call_netdevice_unregister_net_notifiers(nb, net);
+		__rtnl_net_unlock(net);
+	}
 
 	raw_notifier_chain_unregister(&netdev_chain, nb);
 	goto unlock;
@@ -1821,8 +1857,11 @@ int unregister_netdevice_notifier(struct notifier_block *nb)
 	if (err)
 		goto unlock;
 
-	for_each_net(net)
+	for_each_net(net) {
+		__rtnl_net_lock(net);
 		call_netdevice_unregister_net_notifiers(nb, net);
+		__rtnl_net_unlock(net);
+	}
 
 unlock:
 	rtnl_unlock();
@@ -1886,9 +1925,10 @@ int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb)
 {
 	int err;
 
-	rtnl_lock();
+	rtnl_net_lock(net);
 	err = __register_netdevice_notifier_net(net, nb, false);
-	rtnl_unlock();
+	rtnl_net_unlock(net);
+
 	return err;
 }
 EXPORT_SYMBOL(register_netdevice_notifier_net);
@@ -1914,9 +1954,10 @@ int unregister_netdevice_notifier_net(struct net *net,
 {
 	int err;
 
-	rtnl_lock();
+	rtnl_net_lock(net);
 	err = __unregister_netdevice_notifier_net(net, nb);
-	rtnl_unlock();
+	rtnl_net_unlock(net);
+
 	return err;
 }
 EXPORT_SYMBOL(unregister_netdevice_notifier_net);
@@ -1933,15 +1974,17 @@ int register_netdevice_notifier_dev_net(struct net_device *dev,
 					struct notifier_block *nb,
 					struct netdev_net_notifier *nn)
 {
+	struct net *net = dev_net(dev);
 	int err;
 
-	rtnl_lock();
-	err = __register_netdevice_notifier_net(dev_net(dev), nb, false);
+	rtnl_net_lock(net);
+	err = __register_netdevice_notifier_net(net, nb, false);
 	if (!err) {
 		nn->nb = nb;
 		list_add(&nn->list, &dev->net_notifier_list);
 	}
-	rtnl_unlock();
+	rtnl_net_unlock(net);
+
 	return err;
 }
 EXPORT_SYMBOL(register_netdevice_notifier_dev_net);
@@ -1950,12 +1993,14 @@ int unregister_netdevice_notifier_dev_net(struct net_device *dev,
 					  struct notifier_block *nb,
 					  struct netdev_net_notifier *nn)
 {
+	struct net *net = dev_net(dev);
 	int err;
 
-	rtnl_lock();
+	rtnl_net_lock(net);
 	list_del(&nn->list);
-	err = __unregister_netdevice_notifier_net(dev_net(dev), nb);
-	rtnl_unlock();
+	err = __unregister_netdevice_notifier_net(net, nb);
+	rtnl_net_unlock(net);
+
 	return err;
 }
 EXPORT_SYMBOL(unregister_netdevice_notifier_dev_net);
@@ -3233,7 +3278,7 @@ void netif_device_attach(struct net_device *dev)
 	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
 	    netif_running(dev)) {
 		netif_tx_wake_all_queues(dev);
-		__netdev_watchdog_up(dev);
+		netdev_watchdog_up(dev);
 	}
 }
 EXPORT_SYMBOL(netif_device_attach);
@@ -5476,8 +5521,14 @@ static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
 	orig_dev = skb->dev;
 
 	skb_reset_network_header(skb);
+#if !defined(CONFIG_DEBUG_NET)
+	/* We plan to no longer reset the transport header here.
+	 * Give some time to fuzzers and dev build to catch bugs
+	 * in network stacks.
+	 */
 	if (!skb_transport_header_was_set(skb))
 		skb_reset_transport_header(skb);
+#endif
 	skb_reset_mac_len(skb);
 
 	pt_prev = NULL;
@@ -6293,19 +6344,6 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
 }
 EXPORT_SYMBOL(napi_complete_done);
 
-/* must be called under rcu_read_lock(), as we dont take a reference */
-struct napi_struct *napi_by_id(unsigned int napi_id)
-{
-	unsigned int hash = napi_id % HASH_SIZE(napi_hash);
-	struct napi_struct *napi;
-
-	hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
-		if (napi->napi_id == napi_id)
-			return napi;
-
-	return NULL;
-}
-
 static void skb_defer_free_flush(struct softnet_data *sd)
 {
 	struct sk_buff *skb, *next;
@@ -6713,13 +6751,14 @@ static void napi_restore_config(struct napi_struct *n)
 	n->gro_flush_timeout = n->config->gro_flush_timeout;
 	n->irq_suspend_timeout = n->config->irq_suspend_timeout;
 	/* a NAPI ID might be stored in the config, if so use it. if not, use
-	 * napi_hash_add to generate one for us. It will be saved to the config
-	 * in napi_disable.
+	 * napi_hash_add to generate one for us.
 	 */
-	if (n->config->napi_id)
+	if (n->config->napi_id) {
 		napi_hash_add_with_id(n, n->config->napi_id);
-	else
+	} else {
 		napi_hash_add(n);
+		n->config->napi_id = n->napi_id;
+	}
 }
 
 static void napi_save_config(struct napi_struct *n)
@@ -6727,10 +6766,39 @@ static void napi_save_config(struct napi_struct *n)
 	n->config->defer_hard_irqs = n->defer_hard_irqs;
 	n->config->gro_flush_timeout = n->gro_flush_timeout;
 	n->config->irq_suspend_timeout = n->irq_suspend_timeout;
-	n->config->napi_id = n->napi_id;
 	napi_hash_del(n);
 }
 
+/* Netlink wants the NAPI list to be sorted by ID, if adding a NAPI which will
+ * inherit an existing ID try to insert it at the right position.
+ */
+static void
+netif_napi_dev_list_add(struct net_device *dev, struct napi_struct *napi)
+{
+	unsigned int new_id, pos_id;
+	struct list_head *higher;
+	struct napi_struct *pos;
+
+	new_id = UINT_MAX;
+	if (napi->config && napi->config->napi_id)
+		new_id = napi->config->napi_id;
+
+	higher = &dev->napi_list;
+	list_for_each_entry(pos, &dev->napi_list, dev_list) {
+		if (pos->napi_id >= MIN_NAPI_ID)
+			pos_id = pos->napi_id;
+		else if (pos->config)
+			pos_id = pos->config->napi_id;
+		else
+			pos_id = UINT_MAX;
+
+		if (pos_id <= new_id)
+			break;
+		higher = &pos->dev_list;
+	}
+	list_add_rcu(&napi->dev_list, higher); /* adds after higher */
+}
+
 void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
 			   int (*poll)(struct napi_struct *, int), int weight)
 {
@@ -6757,7 +6825,7 @@ void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
 	napi->list_owner = -1;
 	set_bit(NAPI_STATE_SCHED, &napi->state);
 	set_bit(NAPI_STATE_NPSVC, &napi->state);
-	list_add_rcu(&napi->dev_list, &dev->napi_list);
+	netif_napi_dev_list_add(dev, napi);
 
 	/* default settings from sysfs are applied to all NAPIs. any per-NAPI
 	 * configuration will be loaded in napi_enable
@@ -10668,26 +10736,20 @@ err_free_name:
 EXPORT_SYMBOL(register_netdevice);
 
 /* Initialize the core of a dummy net device.
- * This is useful if you are calling this function after alloc_netdev(),
- * since it does not memset the net_device fields.
+ * The setup steps dummy netdevs need which normal netdevs get by going
+ * through register_netdevice().
  */
-static void init_dummy_netdev_core(struct net_device *dev)
+static void init_dummy_netdev(struct net_device *dev)
 {
 	/* make sure we BUG if trying to hit standard
 	 * register/unregister code path
 	 */
 	dev->reg_state = NETREG_DUMMY;
 
-	/* NAPI wants this */
-	INIT_LIST_HEAD(&dev->napi_list);
-
 	/* a dummy interface is started by default */
 	set_bit(__LINK_STATE_PRESENT, &dev->state);
 	set_bit(__LINK_STATE_START, &dev->state);
 
-	/* napi_busy_loop stats accounting wants this */
-	dev_net_set(dev, &init_net);
-
 	/* Note : We dont allocate pcpu_refcnt for dummy devices,
 	 * because users of this 'device' dont need to change
 	 * its refcount.
@@ -10695,28 +10757,6 @@ static void init_dummy_netdev_core(struct net_device *dev)
 }
 
 /**
- *	init_dummy_netdev	- init a dummy network device for NAPI
- *	@dev: device to init
- *
- *	This takes a network device structure and initializes the minimum
- *	amount of fields so it can be used to schedule NAPI polls without
- *	registering a full blown interface. This is to be used by drivers
- *	that need to tie several hardware interfaces to a single NAPI
- *	poll scheduler due to HW limitations.
- */
-void init_dummy_netdev(struct net_device *dev)
-{
-	/* Clear everything. Note we don't initialize spinlocks
-	 * as they aren't supposed to be taken by any of the
-	 * NAPI code and this dummy netdev is supposed to be
-	 * only ever used for NAPI polls
-	 */
-	memset(dev, 0, sizeof(struct net_device));
-	init_dummy_netdev_core(dev);
-}
-EXPORT_SYMBOL_GPL(init_dummy_netdev);
-
-/**
  *	register_netdev	- register a network device
  *	@dev: device to register
  *
@@ -10731,12 +10771,16 @@ EXPORT_SYMBOL_GPL(init_dummy_netdev);
  */
 int register_netdev(struct net_device *dev)
 {
+	struct net *net = dev_net(dev);
 	int err;
 
-	if (rtnl_lock_killable())
+	if (rtnl_net_lock_killable(net))
 		return -EINTR;
+
 	err = register_netdevice(dev);
-	rtnl_unlock();
+
+	rtnl_net_unlock(net);
+
 	return err;
 }
 EXPORT_SYMBOL(register_netdev);
@@ -11390,7 +11434,7 @@ EXPORT_SYMBOL(free_netdev);
 struct net_device *alloc_netdev_dummy(int sizeof_priv)
 {
 	return alloc_netdev(sizeof_priv, "dummy#", NET_NAME_UNKNOWN,
-			    init_dummy_netdev_core);
+			    init_dummy_netdev);
 }
 EXPORT_SYMBOL_GPL(alloc_netdev_dummy);
 
@@ -11606,9 +11650,11 @@ EXPORT_SYMBOL(unregister_netdevice_many);
  */
 void unregister_netdev(struct net_device *dev)
 {
-	rtnl_lock();
+	struct net *net = dev_net(dev);
+
+	rtnl_net_lock(net);
 	unregister_netdevice(dev);
-	rtnl_unlock();
+	rtnl_net_unlock(net);
 }
 EXPORT_SYMBOL(unregister_netdev);
 
diff --git a/net/core/dev.h b/net/core/dev.h
index aa91eed55a40..d8966847794c 100644
--- a/net/core/dev.h
+++ b/net/core/dev.h
@@ -22,6 +22,9 @@ struct sd_flow_limit {
 
 extern int netdev_flow_limit_table_len;
 
+struct napi_struct *netdev_napi_by_id(struct net *net, unsigned int napi_id);
+struct net_device *dev_get_by_napi_id(unsigned int napi_id);
+
 #ifdef CONFIG_PROC_FS
 int __init dev_proc_init(void);
 #else
@@ -269,7 +272,6 @@ void xdp_do_check_flushed(struct napi_struct *napi);
 static inline void xdp_do_check_flushed(struct napi_struct *napi) { }
 #endif
 
-struct napi_struct *napi_by_id(unsigned int napi_id);
 void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu);
 
 #define XMIT_RECURSION_LIMIT	8
diff --git a/net/core/link_watch.c b/net/core/link_watch.c
index 1b4d39e38084..cb04ef2b9807 100644
--- a/net/core/link_watch.c
+++ b/net/core/link_watch.c
@@ -42,14 +42,18 @@ static unsigned int default_operstate(const struct net_device *dev)
 	 * first check whether lower is indeed the source of its down state.
 	 */
 	if (!netif_carrier_ok(dev)) {
-		int iflink = dev_get_iflink(dev);
 		struct net_device *peer;
+		int iflink;
 
 		/* If called from netdev_run_todo()/linkwatch_sync_dev(),
 		 * dev_net(dev) can be already freed, and RTNL is not held.
 		 */
-		if (dev->reg_state == NETREG_UNREGISTERED ||
-		    iflink == dev->ifindex)
+		if (dev->reg_state <= NETREG_REGISTERED)
+			iflink = dev_get_iflink(dev);
+		else
+			iflink = dev->ifindex;
+
+		if (iflink == dev->ifindex)
 			return IF_OPER_DOWN;
 
 		ASSERT_RTNL();
diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c
index b0772d135efb..c59619a2ec23 100644
--- a/net/core/netdev-genl.c
+++ b/net/core/netdev-genl.c
@@ -167,8 +167,6 @@ netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi,
 	void *hdr;
 	pid_t pid;
 
-	if (WARN_ON_ONCE(!napi->dev))
-		return -EINVAL;
 	if (!(napi->dev->flags & IFF_UP))
 		return 0;
 
@@ -176,8 +174,7 @@ netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi,
 	if (!hdr)
 		return -EMSGSIZE;
 
-	if (napi->napi_id >= MIN_NAPI_ID &&
-	    nla_put_u32(rsp, NETDEV_A_NAPI_ID, napi->napi_id))
+	if (nla_put_u32(rsp, NETDEV_A_NAPI_ID, napi->napi_id))
 		goto nla_put_failure;
 
 	if (nla_put_u32(rsp, NETDEV_A_NAPI_IFINDEX, napi->dev->ifindex))
@@ -235,7 +232,7 @@ int netdev_nl_napi_get_doit(struct sk_buff *skb, struct genl_info *info)
 	rtnl_lock();
 	rcu_read_lock();
 
-	napi = napi_by_id(napi_id);
+	napi = netdev_napi_by_id(genl_info_net(info), napi_id);
 	if (napi) {
 		err = netdev_nl_napi_fill_one(rsp, napi, info);
 	} else {
@@ -266,12 +263,21 @@ netdev_nl_napi_dump_one(struct net_device *netdev, struct sk_buff *rsp,
 			struct netdev_nl_dump_ctx *ctx)
 {
 	struct napi_struct *napi;
+	unsigned int prev_id;
 	int err = 0;
 
 	if (!(netdev->flags & IFF_UP))
 		return err;
 
+	prev_id = UINT_MAX;
 	list_for_each_entry(napi, &netdev->napi_list, dev_list) {
+		if (napi->napi_id < MIN_NAPI_ID)
+			continue;
+
+		/* Dump continuation below depends on the list being sorted */
+		WARN_ON_ONCE(napi->napi_id >= prev_id);
+		prev_id = napi->napi_id;
+
 		if (ctx->napi_id && napi->napi_id >= ctx->napi_id)
 			continue;
 
@@ -354,7 +360,7 @@ int netdev_nl_napi_set_doit(struct sk_buff *skb, struct genl_info *info)
 	rtnl_lock();
 	rcu_read_lock();
 
-	napi = napi_by_id(napi_id);
+	napi = netdev_napi_by_id(genl_info_net(info), napi_id);
 	if (napi) {
 		err = netdev_nl_napi_set_config(napi, info);
 	} else {
diff --git a/net/core/netdev_rx_queue.c b/net/core/netdev_rx_queue.c
index e217a5838c87..db82786fa0c4 100644
--- a/net/core/netdev_rx_queue.c
+++ b/net/core/netdev_rx_queue.c
@@ -79,3 +79,4 @@ err_free_new_mem:
 
 	return err;
 }
+EXPORT_SYMBOL_NS_GPL(netdev_rx_queue_restart, "NETDEV_INTERNAL");
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 6b745096809d..1f4d4b5570ab 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -84,7 +84,6 @@ int rtnl_lock_killable(void)
 {
 	return mutex_lock_killable(&rtnl_mutex);
 }
-EXPORT_SYMBOL(rtnl_lock_killable);
 
 static struct sk_buff *defer_kfree_skb_list;
 void rtnl_kfree_skbs(struct sk_buff *head, struct sk_buff *tail)
@@ -221,6 +220,16 @@ int rtnl_net_trylock(struct net *net)
 }
 EXPORT_SYMBOL(rtnl_net_trylock);
 
+int rtnl_net_lock_killable(struct net *net)
+{
+	int ret = rtnl_lock_killable();
+
+	if (!ret)
+		__rtnl_net_lock(net);
+
+	return ret;
+}
+
 static int rtnl_net_cmp_locks(const struct net *net_a, const struct net *net_b)
 {
 	if (net_eq(net_a, net_b))
diff --git a/net/dsa/user.c b/net/dsa/user.c
index 4a8de48a6f24..c74f2b2b92de 100644
--- a/net/dsa/user.c
+++ b/net/dsa/user.c
@@ -1251,7 +1251,6 @@ static int dsa_user_get_eee(struct net_device *dev, struct ethtool_keee *e)
 {
 	struct dsa_port *dp = dsa_user_to_port(dev);
 	struct dsa_switch *ds = dp->ds;
-	int ret;
 
 	/* Check whether the switch supports EEE */
 	if (!ds->ops->support_eee || !ds->ops->support_eee(ds, dp->index))
@@ -1261,13 +1260,6 @@ static int dsa_user_get_eee(struct net_device *dev, struct ethtool_keee *e)
 	if (!dev->phydev)
 		return -ENODEV;
 
-	if (!ds->ops->get_mac_eee)
-		return -EOPNOTSUPP;
-
-	ret = ds->ops->get_mac_eee(ds, dp->index, e);
-	if (ret)
-		return ret;
-
 	return phylink_ethtool_get_eee(dp->pl, e);
 }
 
diff --git a/net/ethtool/common.c b/net/ethtool/common.c
index 2607aea1fbfb..2bd77c94f9f1 100644
--- a/net/ethtool/common.c
+++ b/net/ethtool/common.c
@@ -869,6 +869,7 @@ int __ethtool_get_ts_info(struct net_device *dev,
 			  struct kernel_ethtool_ts_info *info)
 {
 	struct hwtstamp_provider *hwprov;
+	int err = 0;
 
 	rcu_read_lock();
 	hwprov = rcu_dereference(dev->hwprov);
@@ -876,7 +877,6 @@ int __ethtool_get_ts_info(struct net_device *dev,
 	if (!hwprov) {
 		const struct ethtool_ops *ops = dev->ethtool_ops;
 		struct phy_device *phydev = dev->phydev;
-		int err = 0;
 
 		ethtool_init_tsinfo(info);
 		if (phy_is_default_hwtstamp(phydev) &&
@@ -892,8 +892,9 @@ int __ethtool_get_ts_info(struct net_device *dev,
 		return err;
 	}
 
+	err = ethtool_get_ts_info_by_phc(dev, info, &hwprov->desc);
 	rcu_read_unlock();
-	return ethtool_get_ts_info_by_phc(dev, info, &hwprov->desc);
+	return err;
 }
 
 bool net_support_hwtstamp_qualifier(struct net_device *dev,
diff --git a/net/ethtool/linkstate.c b/net/ethtool/linkstate.c
index 34d76e87847d..af19e1bed303 100644
--- a/net/ethtool/linkstate.c
+++ b/net/ethtool/linkstate.c
@@ -3,6 +3,7 @@
 #include "netlink.h"
 #include "common.h"
 #include <linux/phy.h>
+#include <linux/phylib_stubs.h>
 
 struct linkstate_req_info {
 	struct ethnl_req_info		base;
@@ -26,9 +27,8 @@ const struct nla_policy ethnl_linkstate_get_policy[] = {
 		NLA_POLICY_NESTED(ethnl_header_policy_stats),
 };
 
-static int linkstate_get_sqi(struct net_device *dev)
+static int linkstate_get_sqi(struct phy_device *phydev)
 {
-	struct phy_device *phydev = dev->phydev;
 	int ret;
 
 	if (!phydev)
@@ -46,9 +46,8 @@ static int linkstate_get_sqi(struct net_device *dev)
 	return ret;
 }
 
-static int linkstate_get_sqi_max(struct net_device *dev)
+static int linkstate_get_sqi_max(struct phy_device *phydev)
 {
-	struct phy_device *phydev = dev->phydev;
 	int ret;
 
 	if (!phydev)
@@ -100,19 +99,28 @@ static int linkstate_prepare_data(const struct ethnl_req_info *req_base,
 {
 	struct linkstate_reply_data *data = LINKSTATE_REPDATA(reply_base);
 	struct net_device *dev = reply_base->dev;
+	struct nlattr **tb = info->attrs;
+	struct phy_device *phydev;
 	int ret;
 
+	phydev = ethnl_req_get_phydev(req_base, tb[ETHTOOL_A_LINKSTATE_HEADER],
+				      info->extack);
+	if (IS_ERR(phydev)) {
+		ret = PTR_ERR(phydev);
+		goto out;
+	}
+
 	ret = ethnl_ops_begin(dev);
 	if (ret < 0)
 		return ret;
 	data->link = __ethtool_get_link(dev);
 
-	ret = linkstate_get_sqi(dev);
+	ret = linkstate_get_sqi(phydev);
 	if (linkstate_sqi_critical_error(ret))
 		goto out;
 	data->sqi = ret;
 
-	ret = linkstate_get_sqi_max(dev);
+	ret = linkstate_get_sqi_max(phydev);
 	if (linkstate_sqi_critical_error(ret))
 		goto out;
 	data->sqi_max = ret;
@@ -127,9 +135,9 @@ static int linkstate_prepare_data(const struct ethnl_req_info *req_base,
 			   sizeof(data->link_stats) / 8);
 
 	if (req_base->flags & ETHTOOL_FLAG_STATS) {
-		if (dev->phydev)
-			data->link_stats.link_down_events =
-				READ_ONCE(dev->phydev->link_down_events);
+		if (phydev)
+			phy_ethtool_get_link_ext_stats(phydev,
+						       &data->link_stats);
 
 		if (dev->ethtool_ops->get_link_ext_stats)
 			dev->ethtool_ops->get_link_ext_stats(dev,
diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h
index 0a09298fff92..1ce0a3de1430 100644
--- a/net/ethtool/netlink.h
+++ b/net/ethtool/netlink.h
@@ -511,5 +511,6 @@ extern const char stats_eth_phy_names[__ETHTOOL_A_STATS_ETH_PHY_CNT][ETH_GSTRING
 extern const char stats_eth_mac_names[__ETHTOOL_A_STATS_ETH_MAC_CNT][ETH_GSTRING_LEN];
 extern const char stats_eth_ctrl_names[__ETHTOOL_A_STATS_ETH_CTRL_CNT][ETH_GSTRING_LEN];
 extern const char stats_rmon_names[__ETHTOOL_A_STATS_RMON_CNT][ETH_GSTRING_LEN];
+extern const char stats_phy_names[__ETHTOOL_A_STATS_PHY_CNT][ETH_GSTRING_LEN];
 
 #endif /* _NET_ETHTOOL_NETLINK_H */
diff --git a/net/ethtool/stats.c b/net/ethtool/stats.c
index 912f0c4fff2f..038a2558f052 100644
--- a/net/ethtool/stats.c
+++ b/net/ethtool/stats.c
@@ -1,5 +1,8 @@
 // SPDX-License-Identifier: GPL-2.0-only
 
+#include <linux/phy.h>
+#include <linux/phylib_stubs.h>
+
 #include "netlink.h"
 #include "common.h"
 #include "bitset.h"
@@ -20,6 +23,7 @@ struct stats_reply_data {
 		struct ethtool_eth_mac_stats	mac_stats;
 		struct ethtool_eth_ctrl_stats	ctrl_stats;
 		struct ethtool_rmon_stats	rmon_stats;
+		struct ethtool_phy_stats	phydev_stats;
 	);
 	const struct ethtool_rmon_hist_range	*rmon_ranges;
 };
@@ -32,6 +36,7 @@ const char stats_std_names[__ETHTOOL_STATS_CNT][ETH_GSTRING_LEN] = {
 	[ETHTOOL_STATS_ETH_MAC]			= "eth-mac",
 	[ETHTOOL_STATS_ETH_CTRL]		= "eth-ctrl",
 	[ETHTOOL_STATS_RMON]			= "rmon",
+	[ETHTOOL_STATS_PHY]			= "phydev",
 };
 
 const char stats_eth_phy_names[__ETHTOOL_A_STATS_ETH_PHY_CNT][ETH_GSTRING_LEN] = {
@@ -76,6 +81,15 @@ const char stats_rmon_names[__ETHTOOL_A_STATS_RMON_CNT][ETH_GSTRING_LEN] = {
 	[ETHTOOL_A_STATS_RMON_JABBER]		= "etherStatsJabbers",
 };
 
+const char stats_phy_names[__ETHTOOL_A_STATS_PHY_CNT][ETH_GSTRING_LEN] = {
+	[ETHTOOL_A_STATS_PHY_RX_PKTS]		= "RxFrames",
+	[ETHTOOL_A_STATS_PHY_RX_BYTES]		= "RxOctets",
+	[ETHTOOL_A_STATS_PHY_RX_ERRORS]		= "RxErrors",
+	[ETHTOOL_A_STATS_PHY_TX_PKTS]		= "TxFrames",
+	[ETHTOOL_A_STATS_PHY_TX_BYTES]		= "TxOctets",
+	[ETHTOOL_A_STATS_PHY_TX_ERRORS]		= "TxErrors",
+};
+
 const struct nla_policy ethnl_stats_get_policy[ETHTOOL_A_STATS_SRC + 1] = {
 	[ETHTOOL_A_STATS_HEADER]	=
 		NLA_POLICY_NESTED(ethnl_header_policy),
@@ -120,8 +134,15 @@ static int stats_prepare_data(const struct ethnl_req_info *req_base,
 	struct stats_reply_data *data = STATS_REPDATA(reply_base);
 	enum ethtool_mac_stats_src src = req_info->src;
 	struct net_device *dev = reply_base->dev;
+	struct nlattr **tb = info->attrs;
+	struct phy_device *phydev;
 	int ret;
 
+	phydev = ethnl_req_get_phydev(req_base, tb[ETHTOOL_A_STATS_HEADER],
+				      info->extack);
+	if (IS_ERR(phydev))
+		return PTR_ERR(phydev);
+
 	ret = ethnl_ops_begin(dev);
 	if (ret < 0)
 		return ret;
@@ -145,6 +166,14 @@ static int stats_prepare_data(const struct ethnl_req_info *req_base,
 	data->ctrl_stats.src = src;
 	data->rmon_stats.src = src;
 
+	if ((test_bit(ETHTOOL_STATS_PHY, req_info->stat_mask) ||
+	     test_bit(ETHTOOL_STATS_ETH_PHY, req_info->stat_mask)) &&
+	    src == ETHTOOL_MAC_STATS_SRC_AGGREGATE) {
+		if (phydev)
+			phy_ethtool_get_phy_stats(phydev, &data->phy_stats,
+						  &data->phydev_stats);
+	}
+
 	if (test_bit(ETHTOOL_STATS_ETH_PHY, req_info->stat_mask) &&
 	    dev->ethtool_ops->get_eth_phy_stats)
 		dev->ethtool_ops->get_eth_phy_stats(dev, &data->phy_stats);
@@ -194,6 +223,10 @@ static int stats_reply_size(const struct ethnl_req_info *req_base,
 			nla_total_size(4)) *	/* _A_STATS_GRP_HIST_BKT_HI */
 			ETHTOOL_RMON_HIST_MAX * 2;
 	}
+	if (test_bit(ETHTOOL_STATS_PHY, req_info->stat_mask)) {
+		n_stats += sizeof(struct ethtool_phy_stats) / sizeof(u64);
+		n_grps++;
+	}
 
 	len += n_grps * (nla_total_size(0) + /* _A_STATS_GRP */
 			 nla_total_size(4) + /* _A_STATS_GRP_ID */
@@ -247,6 +280,25 @@ static int stats_put_phy_stats(struct sk_buff *skb,
 	return 0;
 }
 
+static int stats_put_phydev_stats(struct sk_buff *skb,
+				  const struct stats_reply_data *data)
+{
+	if (stat_put(skb, ETHTOOL_A_STATS_PHY_RX_PKTS,
+		     data->phydev_stats.rx_packets) ||
+	    stat_put(skb, ETHTOOL_A_STATS_PHY_RX_BYTES,
+		     data->phydev_stats.rx_bytes) ||
+	    stat_put(skb, ETHTOOL_A_STATS_PHY_RX_ERRORS,
+		     data->phydev_stats.rx_errors) ||
+	    stat_put(skb, ETHTOOL_A_STATS_PHY_TX_PKTS,
+		     data->phydev_stats.tx_packets) ||
+	    stat_put(skb, ETHTOOL_A_STATS_PHY_TX_BYTES,
+		     data->phydev_stats.tx_bytes) ||
+	    stat_put(skb, ETHTOOL_A_STATS_PHY_TX_ERRORS,
+		     data->phydev_stats.tx_errors))
+		return -EMSGSIZE;
+	return 0;
+}
+
 static int stats_put_mac_stats(struct sk_buff *skb,
 			       const struct stats_reply_data *data)
 {
@@ -423,6 +475,9 @@ static int stats_fill_reply(struct sk_buff *skb,
 	if (!ret && test_bit(ETHTOOL_STATS_RMON, req_info->stat_mask))
 		ret = stats_put_stats(skb, data, ETHTOOL_STATS_RMON,
 				      ETH_SS_STATS_RMON, stats_put_rmon_stats);
+	if (!ret && test_bit(ETHTOOL_STATS_PHY, req_info->stat_mask))
+		ret = stats_put_stats(skb, data, ETHTOOL_STATS_PHY,
+				      ETH_SS_STATS_PHY, stats_put_phydev_stats);
 
 	return ret;
 }
diff --git a/net/ethtool/strset.c b/net/ethtool/strset.c
index b3382b3cf325..818cf01f0911 100644
--- a/net/ethtool/strset.c
+++ b/net/ethtool/strset.c
@@ -105,6 +105,11 @@ static const struct strset_info info_template[] = {
 		.count		= __ETHTOOL_A_STATS_RMON_CNT,
 		.strings	= stats_rmon_names,
 	},
+	[ETH_SS_STATS_PHY] = {
+		.per_dev	= false,
+		.count		= __ETHTOOL_A_STATS_PHY_CNT,
+		.strings	= stats_phy_names,
+	},
 };
 
 struct strset_req_info {
diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h
index fcfeb79bb040..7d7551e6f0b0 100644
--- a/net/hsr/hsr_main.h
+++ b/net/hsr/hsr_main.h
@@ -163,6 +163,7 @@ struct hsr_port {
 	struct net_device	*dev;
 	struct hsr_priv		*hsr;
 	enum hsr_port_type	type;
+	struct rcu_head		rcu;
 };
 
 struct hsr_frame_info;
diff --git a/net/hsr/hsr_slave.c b/net/hsr/hsr_slave.c
index 464f683e016d..2a802a5de2ac 100644
--- a/net/hsr/hsr_slave.c
+++ b/net/hsr/hsr_slave.c
@@ -204,7 +204,6 @@ int hsr_add_port(struct hsr_priv *hsr, struct net_device *dev,
 	}
 
 	list_add_tail_rcu(&port->port_list, &hsr->ports);
-	synchronize_rcu();
 
 	master = hsr_port_get_hsr(hsr, HSR_PT_MASTER);
 	netdev_update_features(master->dev);
@@ -235,7 +234,5 @@ void hsr_del_port(struct hsr_port *port)
 		netdev_upper_dev_unlink(port->dev, master->dev);
 	}
 
-	synchronize_rcu();
-
-	kfree(port);
+	kfree_rcu(port, rcu);
 }
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index f3281312eb5e..b0fbf804bbba 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -816,7 +816,8 @@ int esp_input_done2(struct sk_buff *skb, int err)
 	}
 
 	skb_pull_rcsum(skb, hlen);
-	if (x->props.mode == XFRM_MODE_TUNNEL)
+	if (x->props.mode == XFRM_MODE_TUNNEL ||
+	    x->props.mode == XFRM_MODE_IPTFS)
 		skb_reset_transport_header(skb);
 	else
 		skb_set_transport_header(skb, -ihl);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index e45222d5fc2e..cc2b5194a18d 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -898,7 +898,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
 	sock_net_set(ctl_sk, net);
 	if (sk) {
 		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
-				   inet_twsk(sk)->tw_mark : sk->sk_mark;
+				   inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
 		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
 				   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
 		transmit_time = tcp_transmit_time(sk);
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 4da409bc4577..c3729382be3b 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -5240,9 +5240,9 @@ int inet6_fill_ifmcaddr(struct sk_buff *skb,
 	return 0;
 }
 
-static int inet6_fill_ifacaddr(struct sk_buff *skb,
-			       const struct ifacaddr6 *ifaca,
-			       struct inet6_fill_args *args)
+int inet6_fill_ifacaddr(struct sk_buff *skb,
+			const struct ifacaddr6 *ifaca,
+			struct inet6_fill_args *args)
 {
 	struct net_device *dev = fib6_info_nh_dev(ifaca->aca_rt);
 	int ifindex = dev ? dev->ifindex : 1;
diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c
index 562cace50ca9..21e01695b48c 100644
--- a/net/ipv6/anycast.c
+++ b/net/ipv6/anycast.c
@@ -278,6 +278,37 @@ static struct ifacaddr6 *aca_alloc(struct fib6_info *f6i,
 	return aca;
 }
 
+static void inet6_ifacaddr_notify(struct net_device *dev,
+				  const struct ifacaddr6 *ifaca, int event)
+{
+	struct inet6_fill_args fillargs = {
+		.event = event,
+		.netnsid = -1,
+	};
+	struct net *net = dev_net(dev);
+	struct sk_buff *skb;
+	int err = -ENOMEM;
+
+	skb = nlmsg_new(NLMSG_ALIGN(sizeof(struct ifaddrmsg)) +
+			nla_total_size(sizeof(struct in6_addr)) +
+			nla_total_size(sizeof(struct ifa_cacheinfo)),
+			GFP_KERNEL);
+	if (!skb)
+		goto error;
+
+	err = inet6_fill_ifacaddr(skb, ifaca, &fillargs);
+	if (err < 0) {
+		pr_err("Failed to fill in anycast addresses (err %d)\n", err);
+		nlmsg_free(skb);
+		goto error;
+	}
+
+	rtnl_notify(skb, net, 0, RTNLGRP_IPV6_ACADDR, NULL, GFP_KERNEL);
+	return;
+error:
+	rtnl_set_sk_err(net, RTNLGRP_IPV6_ACADDR, err);
+}
+
 /*
  *	device anycast group inc (add if not found)
  */
@@ -333,6 +364,8 @@ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr)
 
 	addrconf_join_solict(idev->dev, &aca->aca_addr);
 
+	inet6_ifacaddr_notify(idev->dev, aca, RTM_NEWANYCAST);
+
 	aca_put(aca);
 	return 0;
 out:
@@ -375,6 +408,8 @@ int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr)
 
 	ip6_del_rt(dev_net(idev->dev), aca->aca_rt, false);
 
+	inet6_ifacaddr_notify(idev->dev, aca, RTM_DELANYCAST);
+
 	aca_put(aca);
 	return 0;
 }
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index b2400c226a32..5f3d0cc1555a 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -859,7 +859,8 @@ int esp6_input_done2(struct sk_buff *skb, int err)
 	skb_postpull_rcsum(skb, skb_network_header(skb),
 			   skb_network_header_len(skb));
 	skb_pull_rcsum(skb, hlen);
-	if (x->props.mode == XFRM_MODE_TUNNEL)
+	if (x->props.mode == XFRM_MODE_TUNNEL ||
+	    x->props.mode == XFRM_MODE_IPTFS)
 		skb_reset_transport_header(skb);
 	else
 		skb_set_transport_header(skb, -hdr_len);
diff --git a/net/mac802154/iface.c b/net/mac802154/iface.c
index c0e2da5072be..9e4631fade90 100644
--- a/net/mac802154/iface.c
+++ b/net/mac802154/iface.c
@@ -684,6 +684,10 @@ void ieee802154_if_remove(struct ieee802154_sub_if_data *sdata)
 	ASSERT_RTNL();
 
 	mutex_lock(&sdata->local->iflist_mtx);
+	if (list_empty(&sdata->local->interfaces)) {
+		mutex_unlock(&sdata->local->iflist_mtx);
+		return;
+	}
 	list_del_rcu(&sdata->list);
 	mutex_unlock(&sdata->local->iflist_mtx);
 
diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c
index 38d8121331d4..b0dd008e2114 100644
--- a/net/mptcp/ctrl.c
+++ b/net/mptcp/ctrl.c
@@ -102,16 +102,15 @@ static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet)
 }
 
 #ifdef CONFIG_SYSCTL
-static int mptcp_set_scheduler(const struct net *net, const char *name)
+static int mptcp_set_scheduler(char *scheduler, const char *name)
 {
-	struct mptcp_pernet *pernet = mptcp_get_pernet(net);
 	struct mptcp_sched_ops *sched;
 	int ret = 0;
 
 	rcu_read_lock();
 	sched = mptcp_sched_find(name);
 	if (sched)
-		strscpy(pernet->scheduler, name, MPTCP_SCHED_NAME_MAX);
+		strscpy(scheduler, name, MPTCP_SCHED_NAME_MAX);
 	else
 		ret = -ENOENT;
 	rcu_read_unlock();
@@ -122,7 +121,7 @@ static int mptcp_set_scheduler(const struct net *net, const char *name)
 static int proc_scheduler(const struct ctl_table *ctl, int write,
 			  void *buffer, size_t *lenp, loff_t *ppos)
 {
-	const struct net *net = current->nsproxy->net_ns;
+	char (*scheduler)[MPTCP_SCHED_NAME_MAX] = ctl->data;
 	char val[MPTCP_SCHED_NAME_MAX];
 	struct ctl_table tbl = {
 		.data = val,
@@ -130,11 +129,11 @@ static int proc_scheduler(const struct ctl_table *ctl, int write,
 	};
 	int ret;
 
-	strscpy(val, mptcp_get_scheduler(net), MPTCP_SCHED_NAME_MAX);
+	strscpy(val, *scheduler, MPTCP_SCHED_NAME_MAX);
 
 	ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
 	if (write && ret == 0)
-		ret = mptcp_set_scheduler(net, val);
+		ret = mptcp_set_scheduler(*scheduler, val);
 
 	return ret;
 }
@@ -161,7 +160,9 @@ static int proc_blackhole_detect_timeout(const struct ctl_table *table,
 					 int write, void *buffer, size_t *lenp,
 					 loff_t *ppos)
 {
-	struct mptcp_pernet *pernet = mptcp_get_pernet(current->nsproxy->net_ns);
+	struct mptcp_pernet *pernet = container_of(table->data,
+						   struct mptcp_pernet,
+						   blackhole_timeout);
 	int ret;
 
 	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
@@ -228,7 +229,7 @@ static struct ctl_table mptcp_sysctl_table[] = {
 	{
 		.procname = "available_schedulers",
 		.maxlen	= MPTCP_SCHED_BUF_MAX,
-		.mode = 0644,
+		.mode = 0444,
 		.proc_handler = proc_available_schedulers,
 	},
 	{
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 1b2e7cbb577f..c44c89ecaca6 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -47,7 +47,7 @@ static void __mptcp_destroy_sock(struct sock *sk);
 static void mptcp_check_send_data_fin(struct sock *sk);
 
 DEFINE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions);
-static struct net_device mptcp_napi_dev;
+static struct net_device *mptcp_napi_dev;
 
 /* Returns end sequence number of the receiver's advertised window */
 static u64 mptcp_wnd_end(const struct mptcp_sock *msk)
@@ -4147,11 +4147,13 @@ void __init mptcp_proto_init(void)
 	if (percpu_counter_init(&mptcp_sockets_allocated, 0, GFP_KERNEL))
 		panic("Failed to allocate MPTCP pcpu counter\n");
 
-	init_dummy_netdev(&mptcp_napi_dev);
+	mptcp_napi_dev = alloc_netdev_dummy(0);
+	if (!mptcp_napi_dev)
+		panic("Failed to allocate MPTCP dummy netdev\n");
 	for_each_possible_cpu(cpu) {
 		delegated = per_cpu_ptr(&mptcp_delegated_actions, cpu);
 		INIT_LIST_HEAD(&delegated->head);
-		netif_napi_add_tx(&mptcp_napi_dev, &delegated->napi,
+		netif_napi_add_tx(mptcp_napi_dev, &delegated->napi,
 				  mptcp_napi_poll);
 		napi_enable(&delegated->napi);
 	}
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 9db3e2b0b1c3..456446d7af20 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -2517,12 +2517,15 @@ void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls)
 	struct hlist_nulls_head *hash;
 	unsigned int nr_slots, i;
 
-	if (*sizep > (UINT_MAX / sizeof(struct hlist_nulls_head)))
+	if (*sizep > (INT_MAX / sizeof(struct hlist_nulls_head)))
 		return NULL;
 
 	BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head));
 	nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head));
 
+	if (nr_slots > (INT_MAX / sizeof(struct hlist_nulls_head)))
+		return NULL;
+
 	hash = kvcalloc(nr_slots, sizeof(struct hlist_nulls_head), GFP_KERNEL);
 
 	if (hash && nulls)
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index f7ca7165e66e..83f3face8bb3 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -8821,6 +8821,7 @@ static void nft_unregister_flowtable_hook(struct net *net,
 }
 
 static void __nft_unregister_flowtable_net_hooks(struct net *net,
+						 struct nft_flowtable *flowtable,
 						 struct list_head *hook_list,
 					         bool release_netdev)
 {
@@ -8828,6 +8829,8 @@ static void __nft_unregister_flowtable_net_hooks(struct net *net,
 
 	list_for_each_entry_safe(hook, next, hook_list, list) {
 		nf_unregister_net_hook(net, &hook->ops);
+		flowtable->data.type->setup(&flowtable->data, hook->ops.dev,
+					    FLOW_BLOCK_UNBIND);
 		if (release_netdev) {
 			list_del(&hook->list);
 			kfree_rcu(hook, rcu);
@@ -8836,9 +8839,10 @@ static void __nft_unregister_flowtable_net_hooks(struct net *net,
 }
 
 static void nft_unregister_flowtable_net_hooks(struct net *net,
+					       struct nft_flowtable *flowtable,
 					       struct list_head *hook_list)
 {
-	__nft_unregister_flowtable_net_hooks(net, hook_list, false);
+	__nft_unregister_flowtable_net_hooks(net, flowtable, hook_list, false);
 }
 
 static int nft_register_flowtable_net_hooks(struct net *net,
@@ -9480,8 +9484,6 @@ static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable)
 
 	flowtable->data.type->free(&flowtable->data);
 	list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) {
-		flowtable->data.type->setup(&flowtable->data, hook->ops.dev,
-					    FLOW_BLOCK_UNBIND);
 		list_del_rcu(&hook->list);
 		kfree_rcu(hook, rcu);
 	}
@@ -10869,6 +10871,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
 							   &nft_trans_flowtable_hooks(trans),
 							   trans->msg_type);
 				nft_unregister_flowtable_net_hooks(net,
+								   nft_trans_flowtable(trans),
 								   &nft_trans_flowtable_hooks(trans));
 			} else {
 				list_del_rcu(&nft_trans_flowtable(trans)->list);
@@ -10877,6 +10880,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
 							   NULL,
 							   trans->msg_type);
 				nft_unregister_flowtable_net_hooks(net,
+						nft_trans_flowtable(trans),
 						&nft_trans_flowtable(trans)->hook_list);
 			}
 			break;
@@ -11139,11 +11143,13 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action)
 		case NFT_MSG_NEWFLOWTABLE:
 			if (nft_trans_flowtable_update(trans)) {
 				nft_unregister_flowtable_net_hooks(net,
+						nft_trans_flowtable(trans),
 						&nft_trans_flowtable_hooks(trans));
 			} else {
 				nft_use_dec_restore(&table->use);
 				list_del_rcu(&nft_trans_flowtable(trans)->list);
 				nft_unregister_flowtable_net_hooks(net,
+						nft_trans_flowtable(trans),
 						&nft_trans_flowtable(trans)->hook_list);
 			}
 			break;
@@ -11736,7 +11742,8 @@ static void __nft_release_hook(struct net *net, struct nft_table *table)
 	list_for_each_entry(chain, &table->chains, list)
 		__nf_tables_unregister_hook(net, table, chain, true);
 	list_for_each_entry(flowtable, &table->flowtables, list)
-		__nft_unregister_flowtable_net_hooks(net, &flowtable->hook_list,
+		__nft_unregister_flowtable_net_hooks(net, flowtable,
+						     &flowtable->hook_list,
 						     true);
 }
 
diff --git a/net/netfilter/nft_xfrm.c b/net/netfilter/nft_xfrm.c
index 8a07b46cc8fb..3210cfc966ab 100644
--- a/net/netfilter/nft_xfrm.c
+++ b/net/netfilter/nft_xfrm.c
@@ -112,7 +112,8 @@ static bool xfrm_state_addr_ok(enum nft_xfrm_keys k, u8 family, u8 mode)
 		return true;
 	}
 
-	return mode == XFRM_MODE_BEET || mode == XFRM_MODE_TUNNEL;
+	return mode == XFRM_MODE_BEET || mode == XFRM_MODE_TUNNEL ||
+	       mode == XFRM_MODE_IPTFS;
 }
 
 static void nft_xfrm_state_get_key(const struct nft_xfrm *priv,
diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index 351ac1747224..0581c53e6517 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -61,8 +61,10 @@ static atomic_t rds_tcp_unloading = ATOMIC_INIT(0);
 
 static struct kmem_cache *rds_tcp_conn_slab;
 
-static int rds_tcp_skbuf_handler(const struct ctl_table *ctl, int write,
-				 void *buffer, size_t *lenp, loff_t *fpos);
+static int rds_tcp_sndbuf_handler(const struct ctl_table *ctl, int write,
+				  void *buffer, size_t *lenp, loff_t *fpos);
+static int rds_tcp_rcvbuf_handler(const struct ctl_table *ctl, int write,
+				  void *buffer, size_t *lenp, loff_t *fpos);
 
 static int rds_tcp_min_sndbuf = SOCK_MIN_SNDBUF;
 static int rds_tcp_min_rcvbuf = SOCK_MIN_RCVBUF;
@@ -74,7 +76,7 @@ static struct ctl_table rds_tcp_sysctl_table[] = {
 		/* data is per-net pointer */
 		.maxlen         = sizeof(int),
 		.mode           = 0644,
-		.proc_handler   = rds_tcp_skbuf_handler,
+		.proc_handler   = rds_tcp_sndbuf_handler,
 		.extra1		= &rds_tcp_min_sndbuf,
 	},
 #define	RDS_TCP_RCVBUF	1
@@ -83,7 +85,7 @@ static struct ctl_table rds_tcp_sysctl_table[] = {
 		/* data is per-net pointer */
 		.maxlen         = sizeof(int),
 		.mode           = 0644,
-		.proc_handler   = rds_tcp_skbuf_handler,
+		.proc_handler   = rds_tcp_rcvbuf_handler,
 		.extra1		= &rds_tcp_min_rcvbuf,
 	},
 };
@@ -682,10 +684,10 @@ static void rds_tcp_sysctl_reset(struct net *net)
 	spin_unlock_irq(&rds_tcp_conn_lock);
 }
 
-static int rds_tcp_skbuf_handler(const struct ctl_table *ctl, int write,
+static int rds_tcp_skbuf_handler(struct rds_tcp_net *rtn,
+				 const struct ctl_table *ctl, int write,
 				 void *buffer, size_t *lenp, loff_t *fpos)
 {
-	struct net *net = current->nsproxy->net_ns;
 	int err;
 
 	err = proc_dointvec_minmax(ctl, write, buffer, lenp, fpos);
@@ -694,11 +696,34 @@ static int rds_tcp_skbuf_handler(const struct ctl_table *ctl, int write,
 			*(int *)(ctl->extra1));
 		return err;
 	}
-	if (write)
+
+	if (write && rtn->rds_tcp_listen_sock && rtn->rds_tcp_listen_sock->sk) {
+		struct net *net = sock_net(rtn->rds_tcp_listen_sock->sk);
+
 		rds_tcp_sysctl_reset(net);
+	}
+
 	return 0;
 }
 
+static int rds_tcp_sndbuf_handler(const struct ctl_table *ctl, int write,
+				  void *buffer, size_t *lenp, loff_t *fpos)
+{
+	struct rds_tcp_net *rtn = container_of(ctl->data, struct rds_tcp_net,
+					       sndbuf_size);
+
+	return rds_tcp_skbuf_handler(rtn, ctl, write, buffer, lenp, fpos);
+}
+
+static int rds_tcp_rcvbuf_handler(const struct ctl_table *ctl, int write,
+				  void *buffer, size_t *lenp, loff_t *fpos)
+{
+	struct rds_tcp_net *rtn = container_of(ctl->data, struct rds_tcp_net,
+					       rcvbuf_size);
+
+	return rds_tcp_skbuf_handler(rtn, ctl, write, buffer, lenp, fpos);
+}
+
 static void rds_tcp_exit(void)
 {
 	rds_tcp_set_unloading();
diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c
index 5502998aace7..5c2580a07530 100644
--- a/net/sched/cls_flow.c
+++ b/net/sched/cls_flow.c
@@ -356,7 +356,8 @@ static const struct nla_policy flow_policy[TCA_FLOW_MAX + 1] = {
 	[TCA_FLOW_KEYS]		= { .type = NLA_U32 },
 	[TCA_FLOW_MODE]		= { .type = NLA_U32 },
 	[TCA_FLOW_BASECLASS]	= { .type = NLA_U32 },
-	[TCA_FLOW_RSHIFT]	= { .type = NLA_U32 },
+	[TCA_FLOW_RSHIFT]	= NLA_POLICY_MAX(NLA_U32,
+						 31 /* BITS_PER_U32 - 1 */),
 	[TCA_FLOW_ADDEND]	= { .type = NLA_U32 },
 	[TCA_FLOW_MASK]		= { .type = NLA_U32 },
 	[TCA_FLOW_XOR]		= { .type = NLA_U32 },
diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c
index deb0925f536d..48dd8c88903f 100644
--- a/net/sched/sch_cake.c
+++ b/net/sched/sch_cake.c
@@ -630,6 +630,63 @@ static bool cake_ddst(int flow_mode)
 	return (flow_mode & CAKE_FLOW_DUAL_DST) == CAKE_FLOW_DUAL_DST;
 }
 
+static void cake_dec_srchost_bulk_flow_count(struct cake_tin_data *q,
+					     struct cake_flow *flow,
+					     int flow_mode)
+{
+	if (likely(cake_dsrc(flow_mode) &&
+		   q->hosts[flow->srchost].srchost_bulk_flow_count))
+		q->hosts[flow->srchost].srchost_bulk_flow_count--;
+}
+
+static void cake_inc_srchost_bulk_flow_count(struct cake_tin_data *q,
+					     struct cake_flow *flow,
+					     int flow_mode)
+{
+	if (likely(cake_dsrc(flow_mode) &&
+		   q->hosts[flow->srchost].srchost_bulk_flow_count < CAKE_QUEUES))
+		q->hosts[flow->srchost].srchost_bulk_flow_count++;
+}
+
+static void cake_dec_dsthost_bulk_flow_count(struct cake_tin_data *q,
+					     struct cake_flow *flow,
+					     int flow_mode)
+{
+	if (likely(cake_ddst(flow_mode) &&
+		   q->hosts[flow->dsthost].dsthost_bulk_flow_count))
+		q->hosts[flow->dsthost].dsthost_bulk_flow_count--;
+}
+
+static void cake_inc_dsthost_bulk_flow_count(struct cake_tin_data *q,
+					     struct cake_flow *flow,
+					     int flow_mode)
+{
+	if (likely(cake_ddst(flow_mode) &&
+		   q->hosts[flow->dsthost].dsthost_bulk_flow_count < CAKE_QUEUES))
+		q->hosts[flow->dsthost].dsthost_bulk_flow_count++;
+}
+
+static u16 cake_get_flow_quantum(struct cake_tin_data *q,
+				 struct cake_flow *flow,
+				 int flow_mode)
+{
+	u16 host_load = 1;
+
+	if (cake_dsrc(flow_mode))
+		host_load = max(host_load,
+				q->hosts[flow->srchost].srchost_bulk_flow_count);
+
+	if (cake_ddst(flow_mode))
+		host_load = max(host_load,
+				q->hosts[flow->dsthost].dsthost_bulk_flow_count);
+
+	/* The get_random_u16() is a way to apply dithering to avoid
+	 * accumulating roundoff errors
+	 */
+	return (q->flow_quantum * quantum_div[host_load] +
+		get_random_u16()) >> 16;
+}
+
 static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb,
 		     int flow_mode, u16 flow_override, u16 host_override)
 {
@@ -776,10 +833,8 @@ skip_hash:
 		allocate_dst = cake_ddst(flow_mode);
 
 		if (q->flows[outer_hash + k].set == CAKE_SET_BULK) {
-			if (allocate_src)
-				q->hosts[q->flows[reduced_hash].srchost].srchost_bulk_flow_count--;
-			if (allocate_dst)
-				q->hosts[q->flows[reduced_hash].dsthost].dsthost_bulk_flow_count--;
+			cake_dec_srchost_bulk_flow_count(q, &q->flows[outer_hash + k], flow_mode);
+			cake_dec_dsthost_bulk_flow_count(q, &q->flows[outer_hash + k], flow_mode);
 		}
 found:
 		/* reserve queue for future packets in same flow */
@@ -804,9 +859,10 @@ found:
 			q->hosts[outer_hash + k].srchost_tag = srchost_hash;
 found_src:
 			srchost_idx = outer_hash + k;
-			if (q->flows[reduced_hash].set == CAKE_SET_BULK)
-				q->hosts[srchost_idx].srchost_bulk_flow_count++;
 			q->flows[reduced_hash].srchost = srchost_idx;
+
+			if (q->flows[reduced_hash].set == CAKE_SET_BULK)
+				cake_inc_srchost_bulk_flow_count(q, &q->flows[reduced_hash], flow_mode);
 		}
 
 		if (allocate_dst) {
@@ -827,9 +883,10 @@ found_src:
 			q->hosts[outer_hash + k].dsthost_tag = dsthost_hash;
 found_dst:
 			dsthost_idx = outer_hash + k;
-			if (q->flows[reduced_hash].set == CAKE_SET_BULK)
-				q->hosts[dsthost_idx].dsthost_bulk_flow_count++;
 			q->flows[reduced_hash].dsthost = dsthost_idx;
+
+			if (q->flows[reduced_hash].set == CAKE_SET_BULK)
+				cake_inc_dsthost_bulk_flow_count(q, &q->flows[reduced_hash], flow_mode);
 		}
 	}
 
@@ -1841,10 +1898,6 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 
 	/* flowchain */
 	if (!flow->set || flow->set == CAKE_SET_DECAYING) {
-		struct cake_host *srchost = &b->hosts[flow->srchost];
-		struct cake_host *dsthost = &b->hosts[flow->dsthost];
-		u16 host_load = 1;
-
 		if (!flow->set) {
 			list_add_tail(&flow->flowchain, &b->new_flows);
 		} else {
@@ -1854,18 +1907,8 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 		flow->set = CAKE_SET_SPARSE;
 		b->sparse_flow_count++;
 
-		if (cake_dsrc(q->flow_mode))
-			host_load = max(host_load, srchost->srchost_bulk_flow_count);
-
-		if (cake_ddst(q->flow_mode))
-			host_load = max(host_load, dsthost->dsthost_bulk_flow_count);
-
-		flow->deficit = (b->flow_quantum *
-				 quantum_div[host_load]) >> 16;
+		flow->deficit = cake_get_flow_quantum(b, flow, q->flow_mode);
 	} else if (flow->set == CAKE_SET_SPARSE_WAIT) {
-		struct cake_host *srchost = &b->hosts[flow->srchost];
-		struct cake_host *dsthost = &b->hosts[flow->dsthost];
-
 		/* this flow was empty, accounted as a sparse flow, but actually
 		 * in the bulk rotation.
 		 */
@@ -1873,12 +1916,8 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 		b->sparse_flow_count--;
 		b->bulk_flow_count++;
 
-		if (cake_dsrc(q->flow_mode))
-			srchost->srchost_bulk_flow_count++;
-
-		if (cake_ddst(q->flow_mode))
-			dsthost->dsthost_bulk_flow_count++;
-
+		cake_inc_srchost_bulk_flow_count(b, flow, q->flow_mode);
+		cake_inc_dsthost_bulk_flow_count(b, flow, q->flow_mode);
 	}
 
 	if (q->buffer_used > q->buffer_max_used)
@@ -1935,14 +1974,12 @@ static struct sk_buff *cake_dequeue(struct Qdisc *sch)
 {
 	struct cake_sched_data *q = qdisc_priv(sch);
 	struct cake_tin_data *b = &q->tins[q->cur_tin];
-	struct cake_host *srchost, *dsthost;
 	enum skb_drop_reason reason;
 	ktime_t now = ktime_get();
 	struct cake_flow *flow;
 	struct list_head *head;
 	bool first_flow = true;
 	struct sk_buff *skb;
-	u16 host_load;
 	u64 delay;
 	u32 len;
 
@@ -2042,11 +2079,6 @@ retry:
 	q->cur_flow = flow - b->flows;
 	first_flow = false;
 
-	/* triple isolation (modified DRR++) */
-	srchost = &b->hosts[flow->srchost];
-	dsthost = &b->hosts[flow->dsthost];
-	host_load = 1;
-
 	/* flow isolation (DRR++) */
 	if (flow->deficit <= 0) {
 		/* Keep all flows with deficits out of the sparse and decaying
@@ -2058,11 +2090,8 @@ retry:
 				b->sparse_flow_count--;
 				b->bulk_flow_count++;
 
-				if (cake_dsrc(q->flow_mode))
-					srchost->srchost_bulk_flow_count++;
-
-				if (cake_ddst(q->flow_mode))
-					dsthost->dsthost_bulk_flow_count++;
+				cake_inc_srchost_bulk_flow_count(b, flow, q->flow_mode);
+				cake_inc_dsthost_bulk_flow_count(b, flow, q->flow_mode);
 
 				flow->set = CAKE_SET_BULK;
 			} else {
@@ -2074,19 +2103,7 @@ retry:
 			}
 		}
 
-		if (cake_dsrc(q->flow_mode))
-			host_load = max(host_load, srchost->srchost_bulk_flow_count);
-
-		if (cake_ddst(q->flow_mode))
-			host_load = max(host_load, dsthost->dsthost_bulk_flow_count);
-
-		WARN_ON(host_load > CAKE_QUEUES);
-
-		/* The get_random_u16() is a way to apply dithering to avoid
-		 * accumulating roundoff errors
-		 */
-		flow->deficit += (b->flow_quantum * quantum_div[host_load] +
-				  get_random_u16()) >> 16;
+		flow->deficit += cake_get_flow_quantum(b, flow, q->flow_mode);
 		list_move_tail(&flow->flowchain, &b->old_flows);
 
 		goto retry;
@@ -2110,11 +2127,8 @@ retry:
 				if (flow->set == CAKE_SET_BULK) {
 					b->bulk_flow_count--;
 
-					if (cake_dsrc(q->flow_mode))
-						srchost->srchost_bulk_flow_count--;
-
-					if (cake_ddst(q->flow_mode))
-						dsthost->dsthost_bulk_flow_count--;
+					cake_dec_srchost_bulk_flow_count(b, flow, q->flow_mode);
+					cake_dec_dsthost_bulk_flow_count(b, flow, q->flow_mode);
 
 					b->decaying_flow_count++;
 				} else if (flow->set == CAKE_SET_SPARSE ||
@@ -2132,12 +2146,8 @@ retry:
 				else if (flow->set == CAKE_SET_BULK) {
 					b->bulk_flow_count--;
 
-					if (cake_dsrc(q->flow_mode))
-						srchost->srchost_bulk_flow_count--;
-
-					if (cake_ddst(q->flow_mode))
-						dsthost->dsthost_bulk_flow_count--;
-
+					cake_dec_srchost_bulk_flow_count(b, flow, q->flow_mode);
+					cake_dec_dsthost_bulk_flow_count(b, flow, q->flow_mode);
 				} else
 					b->decaying_flow_count--;
 
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 8874ae668095..14ab2f4c190a 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -551,25 +551,20 @@ static void dev_watchdog(struct timer_list *t)
 		netdev_put(dev, &dev->watchdog_dev_tracker);
 }
 
-void __netdev_watchdog_up(struct net_device *dev)
-{
-	if (dev->netdev_ops->ndo_tx_timeout) {
-		if (dev->watchdog_timeo <= 0)
-			dev->watchdog_timeo = 5*HZ;
-		if (!mod_timer(&dev->watchdog_timer,
-			       round_jiffies(jiffies + dev->watchdog_timeo)))
-			netdev_hold(dev, &dev->watchdog_dev_tracker,
-				    GFP_ATOMIC);
-	}
-}
-EXPORT_SYMBOL_GPL(__netdev_watchdog_up);
-
-static void dev_watchdog_up(struct net_device *dev)
+void netdev_watchdog_up(struct net_device *dev)
 {
-	__netdev_watchdog_up(dev);
+	if (!dev->netdev_ops->ndo_tx_timeout)
+		return;
+	if (dev->watchdog_timeo <= 0)
+		dev->watchdog_timeo = 5*HZ;
+	if (!mod_timer(&dev->watchdog_timer,
+		       round_jiffies(jiffies + dev->watchdog_timeo)))
+		netdev_hold(dev, &dev->watchdog_dev_tracker,
+			    GFP_ATOMIC);
 }
+EXPORT_SYMBOL_GPL(netdev_watchdog_up);
 
-static void dev_watchdog_down(struct net_device *dev)
+static void netdev_watchdog_down(struct net_device *dev)
 {
 	netif_tx_lock_bh(dev);
 	if (del_timer(&dev->watchdog_timer))
@@ -591,7 +586,7 @@ void netif_carrier_on(struct net_device *dev)
 		atomic_inc(&dev->carrier_up_count);
 		linkwatch_fire_event(dev);
 		if (netif_running(dev))
-			__netdev_watchdog_up(dev);
+			netdev_watchdog_up(dev);
 	}
 }
 EXPORT_SYMBOL(netif_carrier_on);
@@ -1267,7 +1262,7 @@ void dev_activate(struct net_device *dev)
 
 	if (need_watchdog) {
 		netif_trans_update(dev);
-		dev_watchdog_up(dev);
+		netdev_watchdog_up(dev);
 	}
 }
 EXPORT_SYMBOL(dev_activate);
@@ -1282,15 +1277,17 @@ static void qdisc_deactivate(struct Qdisc *qdisc)
 
 static void dev_deactivate_queue(struct net_device *dev,
 				 struct netdev_queue *dev_queue,
-				 void *_qdisc_default)
+				 void *_sync_needed)
 {
-	struct Qdisc *qdisc_default = _qdisc_default;
+	bool *sync_needed = _sync_needed;
 	struct Qdisc *qdisc;
 
 	qdisc = rtnl_dereference(dev_queue->qdisc);
 	if (qdisc) {
+		if (qdisc->enqueue)
+			*sync_needed = true;
 		qdisc_deactivate(qdisc);
-		rcu_assign_pointer(dev_queue->qdisc, qdisc_default);
+		rcu_assign_pointer(dev_queue->qdisc, &noop_qdisc);
 	}
 }
 
@@ -1357,24 +1354,22 @@ static bool some_qdisc_is_busy(struct net_device *dev)
  */
 void dev_deactivate_many(struct list_head *head)
 {
+	bool sync_needed = false;
 	struct net_device *dev;
 
 	list_for_each_entry(dev, head, close_list) {
 		netdev_for_each_tx_queue(dev, dev_deactivate_queue,
-					 &noop_qdisc);
+					 &sync_needed);
 		if (dev_ingress_queue(dev))
 			dev_deactivate_queue(dev, dev_ingress_queue(dev),
-					     &noop_qdisc);
+					     &sync_needed);
 
-		dev_watchdog_down(dev);
+		netdev_watchdog_down(dev);
 	}
 
-	/* Wait for outstanding qdisc-less dev_queue_xmit calls or
-	 * outstanding qdisc enqueuing calls.
-	 * This is avoided if all devices are in dismantle phase :
-	 * Caller will call synchronize_net() for us
-	 */
-	synchronize_net();
+	/* Wait for outstanding qdisc enqueuing calls. */
+	if (sync_needed)
+		synchronize_net();
 
 	list_for_each_entry(dev, head, close_list) {
 		netdev_for_each_tx_queue(dev, dev_reset_queue, NULL);
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 8b9a1b96695e..29727ed1008e 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -43,6 +43,7 @@
 #include <net/addrconf.h>
 #include <net/inet_common.h>
 #include <net/inet_ecn.h>
+#include <net/inet_sock.h>
 #include <net/udp_tunnel.h>
 #include <net/inet_dscp.h>
 
@@ -427,16 +428,19 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
 	struct dst_entry *dst = NULL;
 	union sctp_addr *daddr = &t->ipaddr;
 	union sctp_addr dst_saddr;
-	u8 tos = READ_ONCE(inet_sk(sk)->tos);
+	dscp_t dscp;
 
 	if (t->dscp & SCTP_DSCP_SET_MASK)
-		tos = t->dscp & SCTP_DSCP_VAL_MASK;
+		dscp = inet_dsfield_to_dscp(t->dscp);
+	else
+		dscp = inet_sk_dscp(inet_sk(sk));
+
 	memset(&_fl, 0x0, sizeof(_fl));
 	fl4->daddr  = daddr->v4.sin_addr.s_addr;
 	fl4->fl4_dport = daddr->v4.sin_port;
 	fl4->flowi4_proto = IPPROTO_SCTP;
 	if (asoc) {
-		fl4->flowi4_tos = tos & INET_DSCP_MASK;
+		fl4->flowi4_tos = inet_dscp_to_dsfield(dscp);
 		fl4->flowi4_scope = ip_sock_rt_scope(asoc->base.sk);
 		fl4->flowi4_oif = asoc->base.sk->sk_bound_dev_if;
 		fl4->fl4_sport = htons(asoc->base.bind_addr.port);
diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
index e5a5af343c4c..8e1e97be4df7 100644
--- a/net/sctp/sysctl.c
+++ b/net/sctp/sysctl.c
@@ -387,7 +387,8 @@ static struct ctl_table sctp_net_table[] = {
 static int proc_sctp_do_hmac_alg(const struct ctl_table *ctl, int write,
 				 void *buffer, size_t *lenp, loff_t *ppos)
 {
-	struct net *net = current->nsproxy->net_ns;
+	struct net *net = container_of(ctl->data, struct net,
+				       sctp.sctp_hmac_alg);
 	struct ctl_table tbl;
 	bool changed = false;
 	char *none = "none";
@@ -432,7 +433,7 @@ static int proc_sctp_do_hmac_alg(const struct ctl_table *ctl, int write,
 static int proc_sctp_do_rto_min(const struct ctl_table *ctl, int write,
 				void *buffer, size_t *lenp, loff_t *ppos)
 {
-	struct net *net = current->nsproxy->net_ns;
+	struct net *net = container_of(ctl->data, struct net, sctp.rto_min);
 	unsigned int min = *(unsigned int *) ctl->extra1;
 	unsigned int max = *(unsigned int *) ctl->extra2;
 	struct ctl_table tbl;
@@ -460,7 +461,7 @@ static int proc_sctp_do_rto_min(const struct ctl_table *ctl, int write,
 static int proc_sctp_do_rto_max(const struct ctl_table *ctl, int write,
 				void *buffer, size_t *lenp, loff_t *ppos)
 {
-	struct net *net = current->nsproxy->net_ns;
+	struct net *net = container_of(ctl->data, struct net, sctp.rto_max);
 	unsigned int min = *(unsigned int *) ctl->extra1;
 	unsigned int max = *(unsigned int *) ctl->extra2;
 	struct ctl_table tbl;
@@ -498,7 +499,7 @@ static int proc_sctp_do_alpha_beta(const struct ctl_table *ctl, int write,
 static int proc_sctp_do_auth(const struct ctl_table *ctl, int write,
 			     void *buffer, size_t *lenp, loff_t *ppos)
 {
-	struct net *net = current->nsproxy->net_ns;
+	struct net *net = container_of(ctl->data, struct net, sctp.auth_enable);
 	struct ctl_table tbl;
 	int new_value, ret;
 
@@ -527,7 +528,7 @@ static int proc_sctp_do_auth(const struct ctl_table *ctl, int write,
 static int proc_sctp_do_udp_port(const struct ctl_table *ctl, int write,
 				 void *buffer, size_t *lenp, loff_t *ppos)
 {
-	struct net *net = current->nsproxy->net_ns;
+	struct net *net = container_of(ctl->data, struct net, sctp.udp_port);
 	unsigned int min = *(unsigned int *)ctl->extra1;
 	unsigned int max = *(unsigned int *)ctl->extra2;
 	struct ctl_table tbl;
@@ -568,7 +569,8 @@ static int proc_sctp_do_udp_port(const struct ctl_table *ctl, int write,
 static int proc_sctp_do_probe_interval(const struct ctl_table *ctl, int write,
 				       void *buffer, size_t *lenp, loff_t *ppos)
 {
-	struct net *net = current->nsproxy->net_ns;
+	struct net *net = container_of(ctl->data, struct net,
+				       sctp.probe_interval);
 	struct ctl_table tbl;
 	int ret, new_value;
 
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index c370efcfe3e8..ca6984541edb 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -2741,7 +2741,7 @@ int smc_accept(struct socket *sock, struct socket *new_sock,
 			release_sock(clcsk);
 		} else if (!atomic_read(&smc_sk(nsk)->conn.bytes_to_rcv)) {
 			lock_sock(nsk);
-			smc_rx_wait(smc_sk(nsk), &timeo, smc_rx_data_available);
+			smc_rx_wait(smc_sk(nsk), &timeo, 0, smc_rx_data_available);
 			release_sock(nsk);
 		}
 	}
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index ccf57b7fe602..ac07b963aede 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -2155,7 +2155,7 @@ static int smcr_buf_map_link(struct smc_buf_desc *buf_desc, bool is_rmb,
 		for_each_sg(buf_desc->sgt[lnk->link_idx].sgl, sg, nents, i) {
 			size = min_t(int, PAGE_SIZE - offset, buf_size);
 			sg_set_page(sg, vmalloc_to_page(buf), size, offset);
-			buf += size / sizeof(*buf);
+			buf += size;
 			buf_size -= size;
 			offset = 0;
 		}
diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c
index f0cbe77a80b4..e7f1134453ef 100644
--- a/net/smc/smc_rx.c
+++ b/net/smc/smc_rx.c
@@ -197,7 +197,7 @@ static int smc_rx_splice(struct pipe_inode_info *pipe, char *src, size_t len,
 			partial[i].offset = offset;
 			partial[i].len = size;
 			partial[i].private = (unsigned long)priv[i];
-			buf += size / sizeof(*buf);
+			buf += size;
 			left -= size;
 			offset = 0;
 		}
@@ -238,22 +238,23 @@ out:
 	return -ENOMEM;
 }
 
-static int smc_rx_data_available_and_no_splice_pend(struct smc_connection *conn)
+static int smc_rx_data_available_and_no_splice_pend(struct smc_connection *conn, size_t peeked)
 {
-	return atomic_read(&conn->bytes_to_rcv) &&
+	return smc_rx_data_available(conn, peeked) &&
 	       !atomic_read(&conn->splice_pending);
 }
 
 /* blocks rcvbuf consumer until >=len bytes available or timeout or interrupted
  *   @smc    smc socket
  *   @timeo  pointer to max seconds to wait, pointer to value 0 for no timeout
+ *   @peeked  number of bytes already peeked
  *   @fcrit  add'l criterion to evaluate as function pointer
  * Returns:
  * 1 if at least 1 byte available in rcvbuf or if socket error/shutdown.
  * 0 otherwise (nothing in rcvbuf nor timeout, e.g. interrupted).
  */
-int smc_rx_wait(struct smc_sock *smc, long *timeo,
-		int (*fcrit)(struct smc_connection *conn))
+int smc_rx_wait(struct smc_sock *smc, long *timeo, size_t peeked,
+		int (*fcrit)(struct smc_connection *conn, size_t baseline))
 {
 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
 	struct smc_connection *conn = &smc->conn;
@@ -262,7 +263,7 @@ int smc_rx_wait(struct smc_sock *smc, long *timeo,
 	struct sock *sk = &smc->sk;
 	int rc;
 
-	if (fcrit(conn))
+	if (fcrit(conn, peeked))
 		return 1;
 	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
 	add_wait_queue(sk_sleep(sk), &wait);
@@ -271,7 +272,7 @@ int smc_rx_wait(struct smc_sock *smc, long *timeo,
 			   cflags->peer_conn_abort ||
 			   READ_ONCE(sk->sk_shutdown) & RCV_SHUTDOWN ||
 			   conn->killed ||
-			   fcrit(conn),
+			   fcrit(conn, peeked),
 			   &wait);
 	remove_wait_queue(sk_sleep(sk), &wait);
 	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
@@ -322,11 +323,11 @@ static int smc_rx_recv_urg(struct smc_sock *smc, struct msghdr *msg, int len,
 	return -EAGAIN;
 }
 
-static bool smc_rx_recvmsg_data_available(struct smc_sock *smc)
+static bool smc_rx_recvmsg_data_available(struct smc_sock *smc, size_t peeked)
 {
 	struct smc_connection *conn = &smc->conn;
 
-	if (smc_rx_data_available(conn))
+	if (smc_rx_data_available(conn, peeked))
 		return true;
 	else if (conn->urg_state == SMC_URG_VALID)
 		/* we received a single urgent Byte - skip */
@@ -344,10 +345,10 @@ static bool smc_rx_recvmsg_data_available(struct smc_sock *smc)
 int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg,
 		   struct pipe_inode_info *pipe, size_t len, int flags)
 {
-	size_t copylen, read_done = 0, read_remaining = len;
+	size_t copylen, read_done = 0, read_remaining = len, peeked_bytes = 0;
 	size_t chunk_len, chunk_off, chunk_len_sum;
 	struct smc_connection *conn = &smc->conn;
-	int (*func)(struct smc_connection *conn);
+	int (*func)(struct smc_connection *conn, size_t baseline);
 	union smc_host_cursor cons;
 	int readable, chunk;
 	char *rcvbuf_base;
@@ -384,14 +385,14 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg,
 		if (conn->killed)
 			break;
 
-		if (smc_rx_recvmsg_data_available(smc))
+		if (smc_rx_recvmsg_data_available(smc, peeked_bytes))
 			goto copy;
 
 		if (sk->sk_shutdown & RCV_SHUTDOWN) {
 			/* smc_cdc_msg_recv_action() could have run after
 			 * above smc_rx_recvmsg_data_available()
 			 */
-			if (smc_rx_recvmsg_data_available(smc))
+			if (smc_rx_recvmsg_data_available(smc, peeked_bytes))
 				goto copy;
 			break;
 		}
@@ -425,26 +426,28 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg,
 			}
 		}
 
-		if (!smc_rx_data_available(conn)) {
-			smc_rx_wait(smc, &timeo, smc_rx_data_available);
+		if (!smc_rx_data_available(conn, peeked_bytes)) {
+			smc_rx_wait(smc, &timeo, peeked_bytes, smc_rx_data_available);
 			continue;
 		}
 
 copy:
 		/* initialize variables for 1st iteration of subsequent loop */
 		/* could be just 1 byte, even after waiting on data above */
-		readable = atomic_read(&conn->bytes_to_rcv);
+		readable = smc_rx_data_available(conn, peeked_bytes);
 		splbytes = atomic_read(&conn->splice_pending);
 		if (!readable || (msg && splbytes)) {
 			if (splbytes)
 				func = smc_rx_data_available_and_no_splice_pend;
 			else
 				func = smc_rx_data_available;
-			smc_rx_wait(smc, &timeo, func);
+			smc_rx_wait(smc, &timeo, peeked_bytes, func);
 			continue;
 		}
 
 		smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn);
+		if ((flags & MSG_PEEK) && peeked_bytes)
+			smc_curs_add(conn->rmb_desc->len, &cons, peeked_bytes);
 		/* subsequent splice() calls pick up where previous left */
 		if (splbytes)
 			smc_curs_add(conn->rmb_desc->len, &cons, splbytes);
@@ -480,6 +483,8 @@ copy:
 			}
 			read_remaining -= chunk_len;
 			read_done += chunk_len;
+			if (flags & MSG_PEEK)
+				peeked_bytes += chunk_len;
 
 			if (chunk_len_sum == copylen)
 				break; /* either on 1st or 2nd iteration */
diff --git a/net/smc/smc_rx.h b/net/smc/smc_rx.h
index db823c97d824..994f5e42d1ba 100644
--- a/net/smc/smc_rx.h
+++ b/net/smc/smc_rx.h
@@ -21,11 +21,11 @@ void smc_rx_init(struct smc_sock *smc);
 
 int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg,
 		   struct pipe_inode_info *pipe, size_t len, int flags);
-int smc_rx_wait(struct smc_sock *smc, long *timeo,
-		int (*fcrit)(struct smc_connection *conn));
-static inline int smc_rx_data_available(struct smc_connection *conn)
+int smc_rx_wait(struct smc_sock *smc, long *timeo, size_t peeked,
+		int (*fcrit)(struct smc_connection *conn, size_t baseline));
+static inline int smc_rx_data_available(struct smc_connection *conn, size_t peeked)
 {
-	return atomic_read(&conn->bytes_to_rcv);
+	return atomic_read(&conn->bytes_to_rcv) - peeked;
 }
 
 #endif /* SMC_RX_H */
diff --git a/net/socket.c b/net/socket.c
index 16402b8be5a7..4afe31656a2b 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -110,6 +110,8 @@
 #include <linux/ptp_clock_kernel.h>
 #include <trace/events/sock.h>
 
+#include "core/dev.h"
+
 #ifdef CONFIG_NET_RX_BUSY_POLL
 unsigned int sysctl_net_busy_read __read_mostly;
 unsigned int sysctl_net_busy_poll __read_mostly;
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index 9ee5a83c5b40..99ca4465f702 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -737,6 +737,10 @@ static int do_tls_setsockopt_conf(struct sock *sk, sockptr_t optval,
 	else
 		ctx->rx_conf = conf;
 	update_sk_prot(sk, ctx);
+
+	if (update)
+		return 0;
+
 	if (tx) {
 		ctx->sk_write_space = sk->sk_write_space;
 		sk->sk_write_space = tls_write_space;
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 47550d485819..914d4e1516a3 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -458,7 +458,7 @@ int tls_tx_records(struct sock *sk, int flags)
 
 tx_err:
 	if (rc < 0 && rc != -EAGAIN)
-		tls_err_abort(sk, -EBADMSG);
+		tls_err_abort(sk, rc);
 
 	return rc;
 }
diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig
index d7b16f2c23e9..f0157702718f 100644
--- a/net/xfrm/Kconfig
+++ b/net/xfrm/Kconfig
@@ -135,6 +135,22 @@ config NET_KEY_MIGRATE
 
 	  If unsure, say N.
 
+config XFRM_IPTFS
+	tristate "IPsec IP-TFS/AGGFRAG (RFC 9347) encapsulation support"
+	depends on XFRM
+	help
+	  Information on the IP-TFS/AGGFRAG encapsulation can be found
+	  in RFC 9347. This feature supports demand driven (i.e.,
+	  non-constant send rate) IP-TFS to take advantage of the
+	  AGGFRAG ESP payload encapsulation. This payload type
+	  supports aggregation and fragmentation of the inner IP
+	  packet stream which in turn yields higher small-packet
+	  bandwidth as well as reducing MTU/PMTU issues. Congestion
+	  control is unimplementated as the send rate is demand driven
+	  rather than constant.
+
+	  If unsure, say N.
+
 config XFRM_ESPINTCP
 	bool
 
diff --git a/net/xfrm/Makefile b/net/xfrm/Makefile
index 512e0b2f8514..5a1787587cb3 100644
--- a/net/xfrm/Makefile
+++ b/net/xfrm/Makefile
@@ -21,5 +21,6 @@ obj-$(CONFIG_XFRM_USER) += xfrm_user.o
 obj-$(CONFIG_XFRM_USER_COMPAT) += xfrm_compat.o
 obj-$(CONFIG_XFRM_IPCOMP) += xfrm_ipcomp.o
 obj-$(CONFIG_XFRM_INTERFACE) += xfrm_interface.o
+obj-$(CONFIG_XFRM_IPTFS) += xfrm_iptfs.o
 obj-$(CONFIG_XFRM_ESPINTCP) += espintcp.o
 obj-$(CONFIG_DEBUG_INFO_BTF) += xfrm_state_bpf.o
diff --git a/net/xfrm/trace_iptfs.h b/net/xfrm/trace_iptfs.h
new file mode 100644
index 000000000000..74391ba24445
--- /dev/null
+++ b/net/xfrm/trace_iptfs.h
@@ -0,0 +1,218 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* xfrm_trace_iptfs.h
+ *
+ * August 12 2023, Christian Hopps <chopps@labn.net>
+ *
+ * Copyright (c) 2023, LabN Consulting, L.L.C.
+ */
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM iptfs
+
+#if !defined(_TRACE_IPTFS_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_IPTFS_H
+
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/tracepoint.h>
+#include <net/ip.h>
+
+struct xfrm_iptfs_data;
+
+TRACE_EVENT(iptfs_egress_recv,
+	    TP_PROTO(struct sk_buff *skb, struct xfrm_iptfs_data *xtfs, u16 blkoff),
+	    TP_ARGS(skb, xtfs, blkoff),
+	    TP_STRUCT__entry(__field(struct sk_buff *, skb)
+			     __field(void *, head)
+			     __field(void *, head_pg_addr)
+			     __field(void *, pg0addr)
+			     __field(u32, skb_len)
+			     __field(u32, data_len)
+			     __field(u32, headroom)
+			     __field(u32, tailroom)
+			     __field(u32, tail)
+			     __field(u32, end)
+			     __field(u32, pg0off)
+			     __field(u8, head_frag)
+			     __field(u8, frag_list)
+			     __field(u8, nr_frags)
+			     __field(u16, blkoff)),
+	    TP_fast_assign(__entry->skb = skb;
+			   __entry->head = skb->head;
+			   __entry->skb_len = skb->len;
+			   __entry->data_len = skb->data_len;
+			   __entry->headroom = skb_headroom(skb);
+			   __entry->tailroom = skb_tailroom(skb);
+			   __entry->tail = (u32)skb->tail;
+			   __entry->end = (u32)skb->end;
+			   __entry->head_frag = skb->head_frag;
+			   __entry->frag_list = (bool)skb_shinfo(skb)->frag_list;
+			   __entry->nr_frags = skb_shinfo(skb)->nr_frags;
+			   __entry->blkoff = blkoff;
+			   __entry->head_pg_addr = page_address(virt_to_head_page(skb->head));
+			   __entry->pg0addr = (__entry->nr_frags
+					       ? page_address(netmem_to_page(skb_shinfo(skb)->frags[0].netmem))
+					       : NULL);
+			   __entry->pg0off = (__entry->nr_frags
+					      ? skb_shinfo(skb)->frags[0].offset
+					      : 0);
+		    ),
+	    TP_printk("EGRESS: skb=%p len=%u data_len=%u headroom=%u head_frag=%u frag_list=%u nr_frags=%u blkoff=%u\n\t\ttailroom=%u tail=%u end=%u head=%p hdpgaddr=%p pg0->addr=%p pg0->data=%p pg0->off=%u",
+		      __entry->skb, __entry->skb_len, __entry->data_len, __entry->headroom,
+		      __entry->head_frag, __entry->frag_list, __entry->nr_frags, __entry->blkoff,
+		      __entry->tailroom, __entry->tail, __entry->end, __entry->head,
+		      __entry->head_pg_addr, __entry->pg0addr, __entry->pg0addr + __entry->pg0off,
+		      __entry->pg0off)
+	)
+
+DECLARE_EVENT_CLASS(iptfs_ingress_preq_event,
+		    TP_PROTO(struct sk_buff *skb, struct xfrm_iptfs_data *xtfs,
+			     u32 pmtu, u8 was_gso),
+		    TP_ARGS(skb, xtfs, pmtu, was_gso),
+		    TP_STRUCT__entry(__field(struct sk_buff *, skb)
+				     __field(u32, skb_len)
+				     __field(u32, data_len)
+				     __field(u32, pmtu)
+				     __field(u32, queue_size)
+				     __field(u32, proto_seq)
+				     __field(u8, proto)
+				     __field(u8, was_gso)
+			    ),
+		    TP_fast_assign(__entry->skb = skb;
+				   __entry->skb_len = skb->len;
+				   __entry->data_len = skb->data_len;
+				   __entry->queue_size =
+					xtfs->cfg.max_queue_size - xtfs->queue_size;
+				   __entry->proto = __trace_ip_proto(ip_hdr(skb));
+				   __entry->proto_seq = __trace_ip_proto_seq(ip_hdr(skb));
+				   __entry->pmtu = pmtu;
+				   __entry->was_gso = was_gso;
+			    ),
+		    TP_printk("INGRPREQ: skb=%p len=%u data_len=%u qsize=%u proto=%u proto_seq=%u pmtu=%u was_gso=%u",
+			      __entry->skb, __entry->skb_len, __entry->data_len,
+			      __entry->queue_size, __entry->proto, __entry->proto_seq,
+			      __entry->pmtu, __entry->was_gso));
+
+DEFINE_EVENT(iptfs_ingress_preq_event, iptfs_enqueue,
+	     TP_PROTO(struct sk_buff *skb, struct xfrm_iptfs_data *xtfs, u32 pmtu, u8 was_gso),
+	     TP_ARGS(skb, xtfs, pmtu, was_gso));
+
+DEFINE_EVENT(iptfs_ingress_preq_event, iptfs_no_queue_space,
+	     TP_PROTO(struct sk_buff *skb, struct xfrm_iptfs_data *xtfs, u32 pmtu, u8 was_gso),
+	     TP_ARGS(skb, xtfs, pmtu, was_gso));
+
+DEFINE_EVENT(iptfs_ingress_preq_event, iptfs_too_big,
+	     TP_PROTO(struct sk_buff *skb, struct xfrm_iptfs_data *xtfs, u32 pmtu, u8 was_gso),
+	     TP_ARGS(skb, xtfs, pmtu, was_gso));
+
+DECLARE_EVENT_CLASS(iptfs_ingress_postq_event,
+		    TP_PROTO(struct sk_buff *skb, u32 mtu, u16 blkoff, struct iphdr *iph),
+		    TP_ARGS(skb, mtu, blkoff, iph),
+		    TP_STRUCT__entry(__field(struct sk_buff *, skb)
+				     __field(u32, skb_len)
+				     __field(u32, data_len)
+				     __field(u32, mtu)
+				     __field(u32, proto_seq)
+				     __field(u16, blkoff)
+				     __field(u8, proto)),
+		    TP_fast_assign(__entry->skb = skb;
+				   __entry->skb_len = skb->len;
+				   __entry->data_len = skb->data_len;
+				   __entry->mtu = mtu;
+				   __entry->blkoff = blkoff;
+				   __entry->proto = iph ? __trace_ip_proto(iph) : 0;
+				   __entry->proto_seq = iph ? __trace_ip_proto_seq(iph) : 0;
+			    ),
+		    TP_printk("INGRPSTQ: skb=%p len=%u data_len=%u mtu=%u blkoff=%u proto=%u proto_seq=%u",
+			      __entry->skb, __entry->skb_len, __entry->data_len, __entry->mtu,
+			      __entry->blkoff, __entry->proto, __entry->proto_seq));
+
+DEFINE_EVENT(iptfs_ingress_postq_event, iptfs_first_dequeue,
+	     TP_PROTO(struct sk_buff *skb, u32 mtu, u16 blkoff,
+		      struct iphdr *iph),
+	     TP_ARGS(skb, mtu, blkoff, iph));
+
+DEFINE_EVENT(iptfs_ingress_postq_event, iptfs_first_fragmenting,
+	     TP_PROTO(struct sk_buff *skb, u32 mtu, u16 blkoff,
+		      struct iphdr *iph),
+	     TP_ARGS(skb, mtu, blkoff, iph));
+
+DEFINE_EVENT(iptfs_ingress_postq_event, iptfs_first_final_fragment,
+	     TP_PROTO(struct sk_buff *skb, u32 mtu, u16 blkoff,
+		      struct iphdr *iph),
+	     TP_ARGS(skb, mtu, blkoff, iph));
+
+DEFINE_EVENT(iptfs_ingress_postq_event, iptfs_first_toobig,
+	     TP_PROTO(struct sk_buff *skb, u32 mtu, u16 blkoff,
+		      struct iphdr *iph),
+	     TP_ARGS(skb, mtu, blkoff, iph));
+
+TRACE_EVENT(iptfs_ingress_nth_peek,
+	    TP_PROTO(struct sk_buff *skb, u32 remaining),
+	    TP_ARGS(skb, remaining),
+	    TP_STRUCT__entry(__field(struct sk_buff *, skb)
+			     __field(u32, skb_len)
+			     __field(u32, remaining)),
+	    TP_fast_assign(__entry->skb = skb;
+			   __entry->skb_len = skb->len;
+			   __entry->remaining = remaining;
+		    ),
+	    TP_printk("INGRPSTQ: NTHPEEK: skb=%p len=%u remaining=%u",
+		      __entry->skb, __entry->skb_len, __entry->remaining));
+
+TRACE_EVENT(iptfs_ingress_nth_add, TP_PROTO(struct sk_buff *skb, u8 share_ok),
+	    TP_ARGS(skb, share_ok),
+	    TP_STRUCT__entry(__field(struct sk_buff *, skb)
+			     __field(u32, skb_len)
+			     __field(u32, data_len)
+			     __field(u8, share_ok)
+			     __field(u8, head_frag)
+			     __field(u8, pp_recycle)
+			     __field(u8, cloned)
+			     __field(u8, shared)
+			     __field(u8, nr_frags)
+			     __field(u8, frag_list)
+		    ),
+	    TP_fast_assign(__entry->skb = skb;
+			   __entry->skb_len = skb->len;
+			   __entry->data_len = skb->data_len;
+			   __entry->share_ok = share_ok;
+			   __entry->head_frag = skb->head_frag;
+			   __entry->pp_recycle = skb->pp_recycle;
+			   __entry->cloned = skb_cloned(skb);
+			   __entry->shared = skb_shared(skb);
+			   __entry->nr_frags = skb_shinfo(skb)->nr_frags;
+			   __entry->frag_list = (bool)skb_shinfo(skb)->frag_list;
+		    ),
+	    TP_printk("INGRPSTQ: NTHADD: skb=%p len=%u data_len=%u share_ok=%u head_frag=%u pp_recycle=%u cloned=%u shared=%u nr_frags=%u frag_list=%u",
+		      __entry->skb, __entry->skb_len, __entry->data_len, __entry->share_ok,
+		      __entry->head_frag, __entry->pp_recycle, __entry->cloned, __entry->shared,
+		      __entry->nr_frags, __entry->frag_list));
+
+DECLARE_EVENT_CLASS(iptfs_timer_event,
+		    TP_PROTO(struct xfrm_iptfs_data *xtfs, u64 time_val),
+		    TP_ARGS(xtfs, time_val),
+		    TP_STRUCT__entry(__field(u64, time_val)
+				     __field(u64, set_time)),
+		    TP_fast_assign(__entry->time_val = time_val;
+				   __entry->set_time = xtfs->iptfs_settime;
+			    ),
+		    TP_printk("TIMER: set_time=%llu time_val=%llu",
+			      __entry->set_time, __entry->time_val));
+
+DEFINE_EVENT(iptfs_timer_event, iptfs_timer_start,
+	     TP_PROTO(struct xfrm_iptfs_data *xtfs, u64 time_val),
+	     TP_ARGS(xtfs, time_val));
+
+DEFINE_EVENT(iptfs_timer_event, iptfs_timer_expire,
+	     TP_PROTO(struct xfrm_iptfs_data *xtfs, u64 time_val),
+	     TP_ARGS(xtfs, time_val));
+
+#endif /* _TRACE_IPTFS_H */
+
+/* This part must be outside protection */
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH ../../net/xfrm
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE trace_iptfs
+#include <trace/define_trace.h>
diff --git a/net/xfrm/xfrm_compat.c b/net/xfrm/xfrm_compat.c
index 5b9ee63e30b6..b8d2e6930041 100644
--- a/net/xfrm/xfrm_compat.c
+++ b/net/xfrm/xfrm_compat.c
@@ -284,9 +284,15 @@ static int xfrm_xlate64_attr(struct sk_buff *dst, const struct nlattr *src)
 	case XFRMA_SA_DIR:
 	case XFRMA_NAT_KEEPALIVE_INTERVAL:
 	case XFRMA_SA_PCPU:
+	case XFRMA_IPTFS_DROP_TIME:
+	case XFRMA_IPTFS_REORDER_WINDOW:
+	case XFRMA_IPTFS_DONT_FRAG:
+	case XFRMA_IPTFS_INIT_DELAY:
+	case XFRMA_IPTFS_MAX_QSIZE:
+	case XFRMA_IPTFS_PKT_SIZE:
 		return xfrm_nla_cpy(dst, src, nla_len(src));
 	default:
-		BUILD_BUG_ON(XFRMA_MAX != XFRMA_SA_PCPU);
+		BUILD_BUG_ON(XFRMA_MAX != XFRMA_IPTFS_PKT_SIZE);
 		pr_warn_once("unsupported nla_type %d\n", src->nla_type);
 		return -EOPNOTSUPP;
 	}
@@ -441,7 +447,7 @@ static int xfrm_xlate32_attr(void *dst, const struct nlattr *nla,
 	int err;
 
 	if (type > XFRMA_MAX) {
-		BUILD_BUG_ON(XFRMA_MAX != XFRMA_SA_PCPU);
+		BUILD_BUG_ON(XFRMA_MAX != XFRMA_IPTFS_PKT_SIZE);
 		NL_SET_ERR_MSG(extack, "Bad attribute");
 		return -EOPNOTSUPP;
 	}
diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c
index b33c4591e09a..d1fa94e52cea 100644
--- a/net/xfrm/xfrm_device.c
+++ b/net/xfrm/xfrm_device.c
@@ -42,7 +42,8 @@ static void __xfrm_mode_tunnel_prep(struct xfrm_state *x, struct sk_buff *skb,
 		skb->transport_header = skb->network_header + hsize;
 
 	skb_reset_mac_len(skb);
-	pskb_pull(skb, skb->mac_len + x->props.header_len);
+	pskb_pull(skb,
+		  skb->mac_len + x->props.header_len - x->props.enc_hdr_len);
 }
 
 static void __xfrm_mode_beet_prep(struct xfrm_state *x, struct sk_buff *skb,
@@ -68,6 +69,7 @@ static void __xfrm_mode_beet_prep(struct xfrm_state *x, struct sk_buff *skb,
 static void xfrm_outer_mode_prep(struct xfrm_state *x, struct sk_buff *skb)
 {
 	switch (x->outer_mode.encap) {
+	case XFRM_MODE_IPTFS:
 	case XFRM_MODE_TUNNEL:
 		if (x->outer_mode.family == AF_INET)
 			return __xfrm_mode_tunnel_prep(x, skb,
diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
index 841a60a6fbfe..7e6a71b9d6a3 100644
--- a/net/xfrm/xfrm_input.c
+++ b/net/xfrm/xfrm_input.c
@@ -48,7 +48,7 @@ static DEFINE_SPINLOCK(xfrm_input_afinfo_lock);
 static struct xfrm_input_afinfo const __rcu *xfrm_input_afinfo[2][AF_INET6 + 1];
 
 static struct gro_cells gro_cells;
-static struct net_device xfrm_napi_dev;
+static struct net_device *xfrm_napi_dev;
 
 static DEFINE_PER_CPU(struct xfrm_trans_tasklet, xfrm_trans_tasklet);
 
@@ -446,6 +446,9 @@ static int xfrm_inner_mode_input(struct xfrm_state *x,
 		WARN_ON_ONCE(1);
 		break;
 	default:
+		if (x->mode_cbs && x->mode_cbs->input)
+			return x->mode_cbs->input(x, skb);
+
 		WARN_ON_ONCE(1);
 		break;
 	}
@@ -453,6 +456,10 @@ static int xfrm_inner_mode_input(struct xfrm_state *x,
 	return -EOPNOTSUPP;
 }
 
+/* NOTE: encap_type - In addition to the normal (non-negative) values for
+ * encap_type, a negative value of -1 or -2 can be used to resume/restart this
+ * function after a previous invocation early terminated for async operation.
+ */
 int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
 {
 	const struct xfrm_state_afinfo *afinfo;
@@ -489,6 +496,10 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
 
 		family = x->props.family;
 
+		/* An encap_type of -2 indicates reconstructed inner packet */
+		if (encap_type == -2)
+			goto resume_decapped;
+
 		/* An encap_type of -1 indicates async resumption. */
 		if (encap_type == -1) {
 			async = 1;
@@ -679,11 +690,14 @@ resume:
 
 		XFRM_MODE_SKB_CB(skb)->protocol = nexthdr;
 
-		if (xfrm_inner_mode_input(x, skb)) {
+		err = xfrm_inner_mode_input(x, skb);
+		if (err == -EINPROGRESS)
+			return 0;
+		else if (err) {
 			XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEMODEERROR);
 			goto drop;
 		}
-
+resume_decapped:
 		if (x->outer_mode.flags & XFRM_MODE_FLAG_TUNNEL) {
 			decaps = 1;
 			break;
@@ -811,8 +825,11 @@ void __init xfrm_input_init(void)
 	int err;
 	int i;
 
-	init_dummy_netdev(&xfrm_napi_dev);
-	err = gro_cells_init(&gro_cells, &xfrm_napi_dev);
+	xfrm_napi_dev = alloc_netdev_dummy(0);
+	if (!xfrm_napi_dev)
+		panic("Failed to allocate XFRM dummy netdev\n");
+
+	err = gro_cells_init(&gro_cells, xfrm_napi_dev);
 	if (err)
 		gro_cells.cells = NULL;
 
diff --git a/net/xfrm/xfrm_iptfs.c b/net/xfrm/xfrm_iptfs.c
new file mode 100644
index 000000000000..755f1eea8bfa
--- /dev/null
+++ b/net/xfrm/xfrm_iptfs.c
@@ -0,0 +1,2764 @@
+// SPDX-License-Identifier: GPL-2.0
+/* xfrm_iptfs: IPTFS encapsulation support
+ *
+ * April 21 2022, Christian Hopps <chopps@labn.net>
+ *
+ * Copyright (c) 2022, LabN Consulting, L.L.C.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/icmpv6.h>
+#include <linux/skbuff_ref.h>
+#include <net/gro.h>
+#include <net/icmp.h>
+#include <net/ip6_route.h>
+#include <net/inet_ecn.h>
+#include <net/xfrm.h>
+
+#include <crypto/aead.h>
+
+#include "xfrm_inout.h"
+#include "trace_iptfs.h"
+
+/* IPTFS encap (header) values. */
+#define IPTFS_SUBTYPE_BASIC 0
+#define IPTFS_SUBTYPE_CC 1
+
+/* ----------------------------------------------- */
+/* IP-TFS default SA values (tunnel egress/dir-in) */
+/* ----------------------------------------------- */
+
+/**
+ * define IPTFS_DEFAULT_DROP_TIME_USECS - default drop time
+ *
+ * The default IPTFS drop time in microseconds. The drop time is the amount of
+ * time before a missing out-of-order IPTFS tunnel packet is considered lost.
+ * See also the reorder window.
+ *
+ * Default 1s.
+ */
+#define IPTFS_DEFAULT_DROP_TIME_USECS 1000000
+
+/**
+ * define IPTFS_DEFAULT_REORDER_WINDOW - default reorder window size
+ *
+ * The default IPTFS reorder window size. The reorder window size dictates the
+ * maximum number of IPTFS tunnel packets in a sequence that may arrive out of
+ * order.
+ *
+ * Default 3. (tcp folks suggested)
+ */
+#define IPTFS_DEFAULT_REORDER_WINDOW 3
+
+/* ------------------------------------------------ */
+/* IPTFS default SA values (tunnel ingress/dir-out) */
+/* ------------------------------------------------ */
+
+/**
+ * define IPTFS_DEFAULT_INIT_DELAY_USECS - default initial output delay
+ *
+ * The initial output delay is the amount of time prior to servicing the output
+ * queue after queueing the first packet on said queue. This applies anytime the
+ * output queue was previously empty.
+ *
+ * Default 0.
+ */
+#define IPTFS_DEFAULT_INIT_DELAY_USECS 0
+
+/**
+ * define IPTFS_DEFAULT_MAX_QUEUE_SIZE - default max output queue size.
+ *
+ * The default IPTFS max output queue size in octets. The output queue is where
+ * received packets destined for output over an IPTFS tunnel are stored prior to
+ * being output in aggregated/fragmented form over the IPTFS tunnel.
+ *
+ * Default 1M.
+ */
+#define IPTFS_DEFAULT_MAX_QUEUE_SIZE (1024 * 10240)
+
+/* Assumed: skb->head is cache aligned.
+ *
+ * L2 Header resv: Arrange for cacheline to start at skb->data - 16 to keep the
+ * to-be-pushed L2 header in the same cacheline as resulting `skb->data` (i.e.,
+ * the L3 header). If cacheline size is > 64 then skb->data + pushed L2 will all
+ * be in a single cacheline if we simply reserve 64 bytes.
+ *
+ * L3 Header resv: For L3+L2 headers (i.e., skb->data points at the IPTFS payload)
+ * we want `skb->data` to be cacheline aligned and all pushed L2L3 headers will
+ * be in their own cacheline[s]. 128 works for cachelins up to 128 bytes, for
+ * any larger cacheline sizes the pushed headers will simply share the cacheline
+ * with the start of the IPTFS payload (skb->data).
+ */
+#define XFRM_IPTFS_MIN_L3HEADROOM 128
+#define XFRM_IPTFS_MIN_L2HEADROOM (L1_CACHE_BYTES > 64 ? 64 : 64 + 16)
+
+/* Min to try to share outer iptfs skb data vs copying into new skb */
+#define IPTFS_PKT_SHARE_MIN 129
+
+#define NSECS_IN_USEC 1000
+
+#define IPTFS_HRTIMER_MODE HRTIMER_MODE_REL_SOFT
+
+/**
+ * struct xfrm_iptfs_config - configuration for the IPTFS tunnel.
+ * @pkt_size: size of the outer IP packet. 0 to use interface and MTU discovery,
+ *	otherwise the user specified value.
+ * @max_queue_size: The maximum number of octets allowed to be queued to be sent
+ *	over the IPTFS SA. The queue size is measured as the size of all the
+ *	packets enqueued.
+ * @reorder_win_size: the number slots in the reorder window, thus the number of
+ *	packets that may arrive out of order.
+ * @dont_frag: true to inhibit fragmenting across IPTFS outer packets.
+ */
+struct xfrm_iptfs_config {
+	u32 pkt_size;	    /* outer_packet_size or 0 */
+	u32 max_queue_size; /* octets */
+	u16 reorder_win_size;
+	u8 dont_frag : 1;
+};
+
+struct skb_wseq {
+	struct sk_buff *skb;
+	u64 drop_time;
+};
+
+/**
+ * struct xfrm_iptfs_data - mode specific xfrm state.
+ * @cfg: IPTFS tunnel config.
+ * @x: owning SA (xfrm_state).
+ * @queue: queued user packets to send.
+ * @queue_size: number of octets on queue (sum of packet sizes).
+ * @ecn_queue_size: octets above with ECN mark.
+ * @init_delay_ns: nanoseconds to wait to send initial IPTFS packet.
+ * @iptfs_timer: output timer.
+ * @iptfs_settime: time the output timer was set.
+ * @payload_mtu: max payload size.
+ * @w_seq_set: true after first seq received.
+ * @w_wantseq: waiting for this seq number as next to process (in order).
+ * @w_saved: the saved buf array (reorder window).
+ * @w_savedlen: the saved len (not size).
+ * @drop_lock: lock to protect reorder queue.
+ * @drop_timer: timer for considering next packet lost.
+ * @drop_time_ns: timer intervan in nanoseconds.
+ * @ra_newskb: new pkt being reassembled.
+ * @ra_wantseq: expected next sequence for reassembly.
+ * @ra_runt: last pkt bytes from very end of last skb.
+ * @ra_runtlen: size of ra_runt.
+ */
+struct xfrm_iptfs_data {
+	struct xfrm_iptfs_config cfg;
+
+	/* Ingress User Input */
+	struct xfrm_state *x;	   /* owning state */
+	struct sk_buff_head queue; /* output queue */
+
+	u32 queue_size;		    /* octets */
+	u32 ecn_queue_size;	    /* octets above which ECN mark */
+	u64 init_delay_ns;	    /* nanoseconds */
+	struct hrtimer iptfs_timer; /* output timer */
+	time64_t iptfs_settime;	    /* time timer was set */
+	u32 payload_mtu;	    /* max payload size */
+
+	/* Tunnel input reordering */
+	bool w_seq_set;		  /* true after first seq received */
+	u64 w_wantseq;		  /* expected next sequence */
+	struct skb_wseq *w_saved; /* the saved buf array */
+	u32 w_savedlen;		  /* the saved len (not size) */
+	spinlock_t drop_lock;
+	struct hrtimer drop_timer;
+	u64 drop_time_ns;
+
+	/* Tunnel input reassembly */
+	struct sk_buff *ra_newskb; /* new pkt being reassembled */
+	u64 ra_wantseq;		   /* expected next sequence */
+	u8 ra_runt[6];		   /* last pkt bytes from last skb */
+	u8 ra_runtlen;		   /* count of ra_runt */
+};
+
+static u32 __iptfs_get_inner_mtu(struct xfrm_state *x, int outer_mtu);
+static enum hrtimer_restart iptfs_delay_timer(struct hrtimer *me);
+static enum hrtimer_restart iptfs_drop_timer(struct hrtimer *me);
+
+/* ================= */
+/* Utility Functions */
+/* ================= */
+
+#ifdef TRACEPOINTS_ENABLED
+static u32 __trace_ip_proto(struct iphdr *iph)
+{
+	if (iph->version == 4)
+		return iph->protocol;
+	return ((struct ipv6hdr *)iph)->nexthdr;
+}
+
+static u32 __trace_ip_proto_seq(struct iphdr *iph)
+{
+	void *nexthdr;
+	u32 protocol = 0;
+
+	if (iph->version == 4) {
+		nexthdr = (void *)(iph + 1);
+		protocol = iph->protocol;
+	} else if (iph->version == 6) {
+		nexthdr = (void *)(((struct ipv6hdr *)(iph)) + 1);
+		protocol = ((struct ipv6hdr *)(iph))->nexthdr;
+	}
+	switch (protocol) {
+	case IPPROTO_ICMP:
+		return ntohs(((struct icmphdr *)nexthdr)->un.echo.sequence);
+	case IPPROTO_ICMPV6:
+		return ntohs(((struct icmp6hdr *)nexthdr)->icmp6_sequence);
+	case IPPROTO_TCP:
+		return ntohl(((struct tcphdr *)nexthdr)->seq);
+	case IPPROTO_UDP:
+		return ntohs(((struct udphdr *)nexthdr)->source);
+	default:
+		return 0;
+	}
+}
+#endif /*TRACEPOINTS_ENABLED*/
+
+static u64 __esp_seq(struct sk_buff *skb)
+{
+	u64 seq = ntohl(XFRM_SKB_CB(skb)->seq.input.low);
+
+	return seq | (u64)ntohl(XFRM_SKB_CB(skb)->seq.input.hi) << 32;
+}
+
+/* ======================= */
+/* IPTFS SK_BUFF Functions */
+/* ======================= */
+
+/**
+ * iptfs_alloc_skb() - Allocate a new `skb`.
+ * @tpl: the skb to copy required meta-data from.
+ * @len: the linear length of the head data, zero is fine.
+ * @l3resv: true if skb reserve needs to support pushing L3 headers
+ *
+ * A new `skb` is allocated and required meta-data is copied from `tpl`, the
+ * head data is sized to `len` + reserved space set according to the @l3resv
+ * boolean.
+ *
+ * When @l3resv is false, resv is XFRM_IPTFS_MIN_L2HEADROOM which arranges for
+ * `skb->data - 16`  which is a good guess for good cache alignment (placing the
+ * to be pushed L2 header at the start of a cacheline.
+ *
+ * Otherwise, @l3resv is true and resv is set to the correct reserved space for
+ * dst->dev plus the calculated L3 overhead for the xfrm dst or
+ * XFRM_IPTFS_MIN_L3HEADROOM whichever is larger. This is then cache aligned so
+ * that all the headers will commonly fall in a cacheline when possible.
+ *
+ * l3resv=true is used on tunnel ingress (tx), because we need to reserve for
+ * the new IPTFS packet (i.e., L2+L3 headers). On tunnel egress (rx) the data
+ * being copied into the skb includes the user L3 headers already so we only
+ * need to reserve for L2.
+ *
+ * Return: the new skb or NULL.
+ */
+static struct sk_buff *iptfs_alloc_skb(struct sk_buff *tpl, u32 len, bool l3resv)
+{
+	struct sk_buff *skb;
+	u32 resv;
+
+	if (!l3resv) {
+		resv = XFRM_IPTFS_MIN_L2HEADROOM;
+	} else {
+		struct dst_entry *dst = skb_dst(tpl);
+
+		resv = LL_RESERVED_SPACE(dst->dev) + dst->header_len;
+		resv = max(resv, XFRM_IPTFS_MIN_L3HEADROOM);
+		resv = L1_CACHE_ALIGN(resv);
+	}
+
+	skb = alloc_skb(len + resv, GFP_ATOMIC | __GFP_NOWARN);
+	if (!skb)
+		return NULL;
+
+	skb_reserve(skb, resv);
+
+	if (!l3resv) {
+		/* xfrm_input resume needs dev and xfrm ext from tunnel pkt */
+		skb->dev = tpl->dev;
+		__skb_ext_copy(skb, tpl);
+	}
+
+	/* dropped by xfrm_input, used by xfrm_output */
+	skb_dst_copy(skb, tpl);
+
+	return skb;
+}
+
+/**
+ * iptfs_skb_head_to_frag() - initialize a skb_frag_t based on skb head data
+ * @skb: skb with the head data
+ * @frag: frag to initialize
+ */
+static void iptfs_skb_head_to_frag(const struct sk_buff *skb, skb_frag_t *frag)
+{
+	struct page *page = virt_to_head_page(skb->data);
+	unsigned char *addr = (unsigned char *)page_address(page);
+
+	skb_frag_fill_page_desc(frag, page, skb->data - addr, skb_headlen(skb));
+}
+
+/**
+ * struct iptfs_skb_frag_walk - use to track a walk through fragments
+ * @fragi: current fragment index
+ * @past: length of data in fragments before @fragi
+ * @total: length of data in all fragments
+ * @nr_frags: number of fragments present in array
+ * @initial_offset: the value passed in to skb_prepare_frag_walk()
+ * @frags: the page fragments inc. room for head page
+ * @pp_recycle: copy of skb->pp_recycle
+ */
+struct iptfs_skb_frag_walk {
+	u32 fragi;
+	u32 past;
+	u32 total;
+	u32 nr_frags;
+	u32 initial_offset;
+	skb_frag_t frags[MAX_SKB_FRAGS + 1];
+	bool pp_recycle;
+};
+
+/**
+ * iptfs_skb_prepare_frag_walk() - initialize a frag walk over an skb.
+ * @skb: the skb to walk.
+ * @initial_offset: start the walk @initial_offset into the skb.
+ * @walk: the walk to initialize
+ *
+ * Future calls to skb_add_frags() will expect the @offset value to be at
+ * least @initial_offset large.
+ */
+static void iptfs_skb_prepare_frag_walk(struct sk_buff *skb, u32 initial_offset,
+					struct iptfs_skb_frag_walk *walk)
+{
+	struct skb_shared_info *shinfo = skb_shinfo(skb);
+	skb_frag_t *frag, *from;
+	u32 i;
+
+	walk->initial_offset = initial_offset;
+	walk->fragi = 0;
+	walk->past = 0;
+	walk->total = 0;
+	walk->nr_frags = 0;
+	walk->pp_recycle = skb->pp_recycle;
+
+	if (skb->head_frag) {
+		if (initial_offset >= skb_headlen(skb)) {
+			initial_offset -= skb_headlen(skb);
+		} else {
+			frag = &walk->frags[walk->nr_frags++];
+			iptfs_skb_head_to_frag(skb, frag);
+			frag->offset += initial_offset;
+			frag->len -= initial_offset;
+			walk->total += frag->len;
+			initial_offset = 0;
+		}
+	} else {
+		initial_offset -= skb_headlen(skb);
+	}
+
+	for (i = 0; i < shinfo->nr_frags; i++) {
+		from = &shinfo->frags[i];
+		if (initial_offset >= from->len) {
+			initial_offset -= from->len;
+			continue;
+		}
+		frag = &walk->frags[walk->nr_frags++];
+		*frag = *from;
+		if (initial_offset) {
+			frag->offset += initial_offset;
+			frag->len -= initial_offset;
+			initial_offset = 0;
+		}
+		walk->total += frag->len;
+	}
+}
+
+static u32 iptfs_skb_reset_frag_walk(struct iptfs_skb_frag_walk *walk,
+				     u32 offset)
+{
+	/* Adjust offset to refer to internal walk values */
+	offset -= walk->initial_offset;
+
+	/* Get to the correct fragment for offset */
+	while (offset < walk->past) {
+		walk->past -= walk->frags[--walk->fragi].len;
+		if (offset >= walk->past)
+			break;
+	}
+	while (offset >= walk->past + walk->frags[walk->fragi].len)
+		walk->past += walk->frags[walk->fragi++].len;
+
+	/* offset now relative to this current frag */
+	offset -= walk->past;
+	return offset;
+}
+
+/**
+ * iptfs_skb_can_add_frags() - check if ok to add frags from walk to skb
+ * @skb: skb to check for adding frags to
+ * @walk: the walk that will be used as source for frags.
+ * @offset: offset from beginning of original skb to start from.
+ * @len: amount of data to add frag references to in @skb.
+ *
+ * Return: true if ok to add frags.
+ */
+static bool iptfs_skb_can_add_frags(const struct sk_buff *skb,
+				    struct iptfs_skb_frag_walk *walk,
+				    u32 offset, u32 len)
+{
+	struct skb_shared_info *shinfo = skb_shinfo(skb);
+	u32 fragi, nr_frags, fraglen;
+
+	if (skb_has_frag_list(skb) || skb->pp_recycle != walk->pp_recycle)
+		return false;
+
+	/* Make offset relative to current frag after setting that */
+	offset = iptfs_skb_reset_frag_walk(walk, offset);
+
+	/* Verify we have array space for the fragments we need to add */
+	fragi = walk->fragi;
+	nr_frags = shinfo->nr_frags;
+	while (len && fragi < walk->nr_frags) {
+		skb_frag_t *frag = &walk->frags[fragi];
+
+		fraglen = frag->len;
+		if (offset) {
+			fraglen -= offset;
+			offset = 0;
+		}
+		if (++nr_frags > MAX_SKB_FRAGS)
+			return false;
+		if (len <= fraglen)
+			return true;
+		len -= fraglen;
+		fragi++;
+	}
+	/* We may not copy all @len but what we have will fit. */
+	return true;
+}
+
+/**
+ * iptfs_skb_add_frags() - add a range of fragment references into an skb
+ * @skb: skb to add references into
+ * @walk: the walk to add referenced fragments from.
+ * @offset: offset from beginning of original skb to start from.
+ * @len: amount of data to add frag references to in @skb.
+ *
+ * iptfs_skb_can_add_frags() should be called before this function to verify
+ * that the destination @skb is compatible with the walk and has space in the
+ * array for the to be added frag references.
+ *
+ * Return: The number of bytes not added to @skb b/c we reached the end of the
+ * walk before adding all of @len.
+ */
+static int iptfs_skb_add_frags(struct sk_buff *skb,
+			       struct iptfs_skb_frag_walk *walk, u32 offset,
+			       u32 len)
+{
+	struct skb_shared_info *shinfo = skb_shinfo(skb);
+	u32 fraglen;
+
+	if (!walk->nr_frags || offset >= walk->total + walk->initial_offset)
+		return len;
+
+	/* make offset relative to current frag after setting that */
+	offset = iptfs_skb_reset_frag_walk(walk, offset);
+
+	while (len && walk->fragi < walk->nr_frags) {
+		skb_frag_t *frag = &walk->frags[walk->fragi];
+		skb_frag_t *tofrag = &shinfo->frags[shinfo->nr_frags];
+
+		*tofrag = *frag;
+		if (offset) {
+			tofrag->offset += offset;
+			tofrag->len -= offset;
+			offset = 0;
+		}
+		__skb_frag_ref(tofrag);
+		shinfo->nr_frags++;
+
+		/* see if we are done */
+		fraglen = tofrag->len;
+		if (len < fraglen) {
+			tofrag->len = len;
+			skb->len += len;
+			skb->data_len += len;
+			return 0;
+		}
+		/* advance to next source fragment */
+		len -= fraglen;			/* careful, use dst bv_len */
+		skb->len += fraglen;		/* careful, "   "    "     */
+		skb->data_len += fraglen;	/* careful, "   "    "     */
+		walk->past += frag->len;	/* careful, use src bv_len */
+		walk->fragi++;
+	}
+	return len;
+}
+
+/* ================================== */
+/* IPTFS Trace Event Definitions      */
+/* ================================== */
+
+#define CREATE_TRACE_POINTS
+#include "trace_iptfs.h"
+
+/* ================================== */
+/* IPTFS Receiving (egress) Functions */
+/* ================================== */
+
+/**
+ * iptfs_pskb_add_frags() - Create and add frags into a new sk_buff.
+ * @tpl: template to create new skb from.
+ * @walk: The source for fragments to add.
+ * @off: The offset into @walk to add frags from, also used with @st and
+ *       @copy_len.
+ * @len: The length of data to add covering frags from @walk into @skb.
+ *       This must be <= @skblen.
+ * @st: The sequence state to copy from into the new head skb.
+ * @copy_len: Copy @copy_len bytes from @st at offset @off into the new skb
+ *            linear space.
+ *
+ * Create a new sk_buff `skb` using the template @tpl. Copy @copy_len bytes from
+ * @st into the new skb linear space, and then add shared fragments from the
+ * frag walk for the remaining @len of data (i.e., @len - @copy_len bytes).
+ *
+ * Return: The newly allocated sk_buff `skb` or NULL if an error occurs.
+ */
+static struct sk_buff *
+iptfs_pskb_add_frags(struct sk_buff *tpl, struct iptfs_skb_frag_walk *walk,
+		     u32 off, u32 len, struct skb_seq_state *st, u32 copy_len)
+{
+	struct sk_buff *skb;
+
+	skb = iptfs_alloc_skb(tpl, copy_len, false);
+	if (!skb)
+		return NULL;
+
+	/* this should not normally be happening */
+	if (!iptfs_skb_can_add_frags(skb, walk, off + copy_len,
+				     len - copy_len)) {
+		kfree_skb(skb);
+		return NULL;
+	}
+
+	if (copy_len &&
+	    skb_copy_seq_read(st, off, skb_put(skb, copy_len), copy_len)) {
+		XFRM_INC_STATS(dev_net(st->root_skb->dev),
+			       LINUX_MIB_XFRMINERROR);
+		kfree_skb(skb);
+		return NULL;
+	}
+
+	iptfs_skb_add_frags(skb, walk, off + copy_len, len - copy_len);
+	return skb;
+}
+
+/**
+ * iptfs_pskb_extract_seq() - Create and load data into a new sk_buff.
+ * @skblen: the total data size for `skb`.
+ * @st: The source for the rest of the data to copy into `skb`.
+ * @off: The offset into @st to copy data from.
+ * @len: The length of data to copy from @st into `skb`. This must be <=
+ *       @skblen.
+ *
+ * Create a new sk_buff `skb` with @skblen of packet data space. If non-zero,
+ * copy @rlen bytes of @runt into `skb`. Then using seq functions copy @len
+ * bytes from @st into `skb` starting from @off.
+ *
+ * It is an error for @len to be greater than the amount of data left in @st.
+ *
+ * Return: The newly allocated sk_buff `skb` or NULL if an error occurs.
+ */
+static struct sk_buff *
+iptfs_pskb_extract_seq(u32 skblen, struct skb_seq_state *st, u32 off, int len)
+{
+	struct sk_buff *skb = iptfs_alloc_skb(st->root_skb, skblen, false);
+
+	if (!skb)
+		return NULL;
+	if (skb_copy_seq_read(st, off, skb_put(skb, len), len)) {
+		XFRM_INC_STATS(dev_net(st->root_skb->dev), LINUX_MIB_XFRMINERROR);
+		kfree_skb(skb);
+		return NULL;
+	}
+	return skb;
+}
+
+/**
+ * iptfs_input_save_runt() - save data in xtfs runt space.
+ * @xtfs: xtfs state
+ * @seq: the current sequence
+ * @buf: packet data
+ * @len: length of packet data
+ *
+ * Save the small (`len`) start of a fragmented packet in `buf` in the xtfs data
+ * runt space.
+ */
+static void iptfs_input_save_runt(struct xfrm_iptfs_data *xtfs, u64 seq,
+				  u8 *buf, int len)
+{
+	memcpy(xtfs->ra_runt, buf, len);
+
+	xtfs->ra_runtlen = len;
+	xtfs->ra_wantseq = seq + 1;
+}
+
+/**
+ * __iptfs_iphlen() - return the v4/v6 header length using packet data.
+ * @data: pointer at octet with version nibble
+ *
+ * The version data has been checked to be valid (i.e., either 4 or 6).
+ *
+ * Return: the IP header size based on the IP version.
+ */
+static u32 __iptfs_iphlen(u8 *data)
+{
+	struct iphdr *iph = (struct iphdr *)data;
+
+	if (iph->version == 0x4)
+		return sizeof(*iph);
+	return sizeof(struct ipv6hdr);
+}
+
+/**
+ * __iptfs_iplen() - return the v4/v6 length using packet data.
+ * @data: pointer to ip (v4/v6) packet header
+ *
+ * Grab the IPv4 or IPv6 length value in the start of the inner packet header
+ * pointed to by `data`. Assumes data len is enough for the length field only.
+ *
+ * The version data has been checked to be valid (i.e., either 4 or 6).
+ *
+ * Return: the length value.
+ */
+static u32 __iptfs_iplen(u8 *data)
+{
+	struct iphdr *iph = (struct iphdr *)data;
+
+	if (iph->version == 0x4)
+		return ntohs(iph->tot_len);
+	return ntohs(((struct ipv6hdr *)iph)->payload_len) +
+		sizeof(struct ipv6hdr);
+}
+
+/**
+ * iptfs_complete_inner_skb() - finish preparing the inner packet for gro recv.
+ * @x: xfrm state
+ * @skb: the inner packet
+ *
+ * Finish the standard xfrm processing on the inner packet prior to sending back
+ * through gro_cells_receive. We do this separately b/c we are building a list
+ * of packets in the hopes that one day a list will be taken by
+ * xfrm_input.
+ */
+static void iptfs_complete_inner_skb(struct xfrm_state *x, struct sk_buff *skb)
+{
+	skb_reset_network_header(skb);
+
+	/* The packet is going back through gro_cells_receive no need to
+	 * set this.
+	 */
+	skb_reset_transport_header(skb);
+
+	/* Packet already has checksum value set. */
+	skb->ip_summed = CHECKSUM_NONE;
+
+	/* Our skb will contain the header data copied when this outer packet
+	 * which contained the start of this inner packet. This is true
+	 * when we allocate a new skb as well as when we reuse the existing skb.
+	 */
+	if (ip_hdr(skb)->version == 0x4) {
+		struct iphdr *iph = ip_hdr(skb);
+
+		if (x->props.flags & XFRM_STATE_DECAP_DSCP)
+			ipv4_copy_dscp(XFRM_MODE_SKB_CB(skb)->tos, iph);
+		if (!(x->props.flags & XFRM_STATE_NOECN))
+			if (INET_ECN_is_ce(XFRM_MODE_SKB_CB(skb)->tos))
+				IP_ECN_set_ce(iph);
+
+		skb->protocol = htons(ETH_P_IP);
+	} else {
+		struct ipv6hdr *iph = ipv6_hdr(skb);
+
+		if (x->props.flags & XFRM_STATE_DECAP_DSCP)
+			ipv6_copy_dscp(XFRM_MODE_SKB_CB(skb)->tos, iph);
+		if (!(x->props.flags & XFRM_STATE_NOECN))
+			if (INET_ECN_is_ce(XFRM_MODE_SKB_CB(skb)->tos))
+				IP6_ECN_set_ce(skb, iph);
+
+		skb->protocol = htons(ETH_P_IPV6);
+	}
+}
+
+static void __iptfs_reassem_done(struct xfrm_iptfs_data *xtfs, bool free)
+{
+	assert_spin_locked(&xtfs->drop_lock);
+
+	/* We don't care if it works locking takes care of things */
+	hrtimer_try_to_cancel(&xtfs->drop_timer);
+	if (free)
+		kfree_skb(xtfs->ra_newskb);
+	xtfs->ra_newskb = NULL;
+}
+
+/**
+ * iptfs_reassem_abort() - In-progress packet is aborted free the state.
+ * @xtfs: xtfs state
+ */
+static void iptfs_reassem_abort(struct xfrm_iptfs_data *xtfs)
+{
+	__iptfs_reassem_done(xtfs, true);
+}
+
+/**
+ * iptfs_reassem_done() - In-progress packet is complete, clear the state.
+ * @xtfs: xtfs state
+ */
+static void iptfs_reassem_done(struct xfrm_iptfs_data *xtfs)
+{
+	__iptfs_reassem_done(xtfs, false);
+}
+
+/**
+ * iptfs_reassem_cont() - Continue the reassembly of an inner packets.
+ * @xtfs: xtfs state
+ * @seq: sequence of current packet
+ * @st: seq read stat for current packet
+ * @skb: current packet
+ * @data: offset into sequential packet data
+ * @blkoff: packet blkoff value
+ * @list: list of skbs to enqueue completed packet on
+ *
+ * Process an IPTFS payload that has a non-zero `blkoff` or when we are
+ * expecting the continuation b/c we have a runt or in-progress packet.
+ *
+ * Return: the new data offset to continue processing from.
+ */
+static u32 iptfs_reassem_cont(struct xfrm_iptfs_data *xtfs, u64 seq,
+			      struct skb_seq_state *st, struct sk_buff *skb,
+			      u32 data, u32 blkoff, struct list_head *list)
+{
+	struct iptfs_skb_frag_walk _fragwalk;
+	struct iptfs_skb_frag_walk *fragwalk = NULL;
+	struct sk_buff *newskb = xtfs->ra_newskb;
+	u32 remaining = skb->len - data;
+	u32 runtlen = xtfs->ra_runtlen;
+	u32 copylen, fraglen, ipremain, iphlen, iphremain, rrem;
+
+	/* Handle packet fragment we aren't expecting */
+	if (!runtlen && !xtfs->ra_newskb)
+		return data + min(blkoff, remaining);
+
+	/* Important to remember that input to this function is an ordered
+	 * packet stream (unless the user disabled the reorder window). Thus if
+	 * we are waiting for, and expecting the next packet so we can continue
+	 * assembly, a newer sequence number indicates older ones are not coming
+	 * (or if they do should be ignored). Technically we can receive older
+	 * ones when the reorder window is disabled; however, the user should
+	 * have disabled fragmentation in this case, and regardless we don't
+	 * deal with it.
+	 *
+	 * blkoff could be zero if the stream is messed up (or it's an all pad
+	 * insertion) be careful to handle that case in each of the below
+	 */
+
+	/* Too old case: This can happen when the reorder window is disabled so
+	 * ordering isn't actually guaranteed.
+	 */
+	if (seq < xtfs->ra_wantseq)
+		return data + remaining;
+
+	/* Too new case: We missed what we wanted cleanup. */
+	if (seq > xtfs->ra_wantseq) {
+		XFRM_INC_STATS(xs_net(xtfs->x), LINUX_MIB_XFRMINIPTFSERROR);
+		goto abandon;
+	}
+
+	if (blkoff == 0) {
+		if ((*skb->data & 0xF0) != 0) {
+			XFRM_INC_STATS(xs_net(xtfs->x),
+				       LINUX_MIB_XFRMINIPTFSERROR);
+			goto abandon;
+		}
+		/* Handle all pad case, advance expected sequence number.
+		 * (RFC 9347 S2.2.3)
+		 */
+		xtfs->ra_wantseq++;
+		/* will end parsing */
+		return data + remaining;
+	}
+
+	if (runtlen) {
+		/* Regardless of what happens we're done with the runt */
+		xtfs->ra_runtlen = 0;
+
+		/* The start of this inner packet was at the very end of the last
+		 * iptfs payload which didn't include enough for the ip header
+		 * length field. We must have *at least* that now.
+		 */
+		rrem = sizeof(xtfs->ra_runt) - runtlen;
+		if (remaining < rrem || blkoff < rrem) {
+			XFRM_INC_STATS(xs_net(xtfs->x),
+				       LINUX_MIB_XFRMINIPTFSERROR);
+			goto abandon;
+		}
+
+		/* fill in the runt data */
+		if (skb_copy_seq_read(st, data, &xtfs->ra_runt[runtlen],
+				      rrem)) {
+			XFRM_INC_STATS(xs_net(xtfs->x),
+				       LINUX_MIB_XFRMINBUFFERERROR);
+			goto abandon;
+		}
+
+		/* We have enough data to get the ip length value now,
+		 * allocate an in progress skb
+		 */
+		ipremain = __iptfs_iplen(xtfs->ra_runt);
+		if (ipremain < sizeof(xtfs->ra_runt)) {
+			/* length has to be at least runtsize large */
+			XFRM_INC_STATS(xs_net(xtfs->x),
+				       LINUX_MIB_XFRMINIPTFSERROR);
+			goto abandon;
+		}
+
+		/* For the runt case we don't attempt sharing currently. NOTE:
+		 * Currently, this IPTFS implementation will not create runts.
+		 */
+
+		newskb = iptfs_alloc_skb(skb, ipremain, false);
+		if (!newskb) {
+			XFRM_INC_STATS(xs_net(xtfs->x), LINUX_MIB_XFRMINERROR);
+			goto abandon;
+		}
+		xtfs->ra_newskb = newskb;
+
+		/* Copy the runt data into the buffer, but leave data
+		 * pointers the same as normal non-runt case. The extra `rrem`
+		 * recopied bytes are basically cacheline free. Allows using
+		 * same logic below to complete.
+		 */
+		memcpy(skb_put(newskb, runtlen), xtfs->ra_runt,
+		       sizeof(xtfs->ra_runt));
+	}
+
+	/* Continue reassembling the packet */
+	ipremain = __iptfs_iplen(newskb->data);
+	iphlen = __iptfs_iphlen(newskb->data);
+
+	ipremain -= newskb->len;
+	if (blkoff < ipremain) {
+		/* Corrupt data, we don't have enough to complete the packet */
+		XFRM_INC_STATS(xs_net(xtfs->x), LINUX_MIB_XFRMINIPTFSERROR);
+		goto abandon;
+	}
+
+	/* We want the IP header in linear space */
+	if (newskb->len < iphlen) {
+		iphremain = iphlen - newskb->len;
+		if (blkoff < iphremain) {
+			XFRM_INC_STATS(xs_net(xtfs->x),
+				       LINUX_MIB_XFRMINIPTFSERROR);
+			goto abandon;
+		}
+		fraglen = min(blkoff, remaining);
+		copylen = min(fraglen, iphremain);
+		if (skb_copy_seq_read(st, data, skb_put(newskb, copylen),
+				      copylen)) {
+			XFRM_INC_STATS(xs_net(xtfs->x),
+				       LINUX_MIB_XFRMINBUFFERERROR);
+			goto abandon;
+		}
+		/* this is a silly condition that might occur anyway */
+		if (copylen < iphremain) {
+			xtfs->ra_wantseq++;
+			return data + fraglen;
+		}
+		/* update data and things derived from it */
+		data += copylen;
+		blkoff -= copylen;
+		remaining -= copylen;
+		ipremain -= copylen;
+	}
+
+	fraglen = min(blkoff, remaining);
+	copylen = min(fraglen, ipremain);
+
+	/* If we may have the opportunity to share prepare a fragwalk. */
+	if (!skb_has_frag_list(skb) && !skb_has_frag_list(newskb) &&
+	    (skb->head_frag || skb->len == skb->data_len) &&
+	    skb->pp_recycle == newskb->pp_recycle) {
+		fragwalk = &_fragwalk;
+		iptfs_skb_prepare_frag_walk(skb, data, fragwalk);
+	}
+
+	/* Try share then copy. */
+	if (fragwalk &&
+	    iptfs_skb_can_add_frags(newskb, fragwalk, data, copylen)) {
+		iptfs_skb_add_frags(newskb, fragwalk, data, copylen);
+	} else {
+		/* copy fragment data into newskb */
+		if (skb_copy_seq_read(st, data, skb_put(newskb, copylen),
+				      copylen)) {
+			XFRM_INC_STATS(xs_net(xtfs->x),
+				       LINUX_MIB_XFRMINBUFFERERROR);
+			goto abandon;
+		}
+	}
+
+	if (copylen < ipremain) {
+		xtfs->ra_wantseq++;
+	} else {
+		/* We are done with packet reassembly! */
+		iptfs_reassem_done(xtfs);
+		iptfs_complete_inner_skb(xtfs->x, newskb);
+		list_add_tail(&newskb->list, list);
+	}
+
+	/* will continue on to new data block or end */
+	return data + fraglen;
+
+abandon:
+	if (xtfs->ra_newskb) {
+		iptfs_reassem_abort(xtfs);
+	} else {
+		xtfs->ra_runtlen = 0;
+		xtfs->ra_wantseq = 0;
+	}
+	/* skip past fragment, maybe to end */
+	return data + min(blkoff, remaining);
+}
+
+static bool __input_process_payload(struct xfrm_state *x, u32 data,
+				    struct skb_seq_state *skbseq,
+				    struct list_head *sublist)
+{
+	u8 hbytes[sizeof(struct ipv6hdr)];
+	struct iptfs_skb_frag_walk _fragwalk;
+	struct iptfs_skb_frag_walk *fragwalk = NULL;
+	struct sk_buff *defer, *first_skb, *next, *skb;
+	const unsigned char *old_mac;
+	struct xfrm_iptfs_data *xtfs;
+	struct iphdr *iph;
+	struct net *net;
+	u32 first_iplen, iphlen, iplen, remaining, tail;
+	u32 capturelen;
+	u64 seq;
+
+	xtfs = x->mode_data;
+	net = xs_net(x);
+	skb = skbseq->root_skb;
+	first_skb = NULL;
+	defer = NULL;
+
+	seq = __esp_seq(skb);
+
+	/* Save the old mac header if set */
+	old_mac = skb_mac_header_was_set(skb) ? skb_mac_header(skb) : NULL;
+
+	/* New packets */
+
+	tail = skb->len;
+	while (data < tail) {
+		__be16 protocol = 0;
+
+		/* Gather information on the next data block.
+		 * `data` points to the start of the data block.
+		 */
+		remaining = tail - data;
+
+		/* try and copy enough bytes to read length from ipv4/ipv6 */
+		iphlen = min_t(u32, remaining, 6);
+		if (skb_copy_seq_read(skbseq, data, hbytes, iphlen)) {
+			XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR);
+			goto done;
+		}
+
+		iph = (struct iphdr *)hbytes;
+		if (iph->version == 0x4) {
+			/* must have at least tot_len field present */
+			if (remaining < 4) {
+				/* save the bytes we have, advance data and exit */
+				iptfs_input_save_runt(xtfs, seq, hbytes,
+						      remaining);
+				data += remaining;
+				break;
+			}
+
+			iplen = be16_to_cpu(iph->tot_len);
+			iphlen = iph->ihl << 2;
+			protocol = cpu_to_be16(ETH_P_IP);
+			XFRM_MODE_SKB_CB(skbseq->root_skb)->tos = iph->tos;
+		} else if (iph->version == 0x6) {
+			/* must have at least payload_len field present */
+			if (remaining < 6) {
+				/* save the bytes we have, advance data and exit */
+				iptfs_input_save_runt(xtfs, seq, hbytes,
+						      remaining);
+				data += remaining;
+				break;
+			}
+
+			iplen = be16_to_cpu(((struct ipv6hdr *)hbytes)->payload_len);
+			iplen += sizeof(struct ipv6hdr);
+			iphlen = sizeof(struct ipv6hdr);
+			protocol = cpu_to_be16(ETH_P_IPV6);
+			XFRM_MODE_SKB_CB(skbseq->root_skb)->tos =
+				ipv6_get_dsfield((struct ipv6hdr *)iph);
+		} else if (iph->version == 0x0) {
+			/* pad */
+			data = tail;
+			break;
+		} else {
+			XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR);
+			goto done;
+		}
+
+		if (unlikely(skbseq->stepped_offset)) {
+			/* We need to reset our seq read, it can't backup at
+			 * this point.
+			 */
+			struct sk_buff *save = skbseq->root_skb;
+
+			skb_abort_seq_read(skbseq);
+			skb_prepare_seq_read(save, data, tail, skbseq);
+		}
+
+		if (first_skb) {
+			skb = NULL;
+		} else {
+			first_skb = skb;
+			first_iplen = iplen;
+			fragwalk = NULL;
+
+			/* We are going to skip over `data` bytes to reach the
+			 * start of the IP header of `iphlen` len for `iplen`
+			 * inner packet.
+			 */
+
+			if (skb_has_frag_list(skb)) {
+				defer = skb;
+				skb = NULL;
+			} else if (data + iphlen <= skb_headlen(skb) &&
+				   /* make sure our header is 32-bit aligned? */
+				   /* ((uintptr_t)(skb->data + data) & 0x3) == 0 && */
+				   skb_tailroom(skb) + tail - data >= iplen) {
+				/* Reuse the received skb.
+				 *
+				 * We have enough headlen to pull past any
+				 * initial fragment data, leaving at least the
+				 * IP header in the linear buffer space.
+				 *
+				 * For linear buffer space we only require that
+				 * linear buffer space is large enough to
+				 * eventually hold the entire reassembled
+				 * packet (by including tailroom in the check).
+				 *
+				 * For non-linear tailroom is 0 and so we only
+				 * re-use if the entire packet is present
+				 * already.
+				 *
+				 * NOTE: there are many more options for
+				 * sharing, KISS for now. Also, this can produce
+				 * skb's with the IP header unaligned to 32
+				 * bits. If that ends up being a problem then a
+				 * check should be added to the conditional
+				 * above that the header lies on a 32-bit
+				 * boundary as well.
+				 */
+				skb_pull(skb, data);
+
+				/* our range just changed */
+				data = 0;
+				tail = skb->len;
+				remaining = skb->len;
+
+				skb->protocol = protocol;
+				skb_mac_header_rebuild(skb);
+				if (skb->mac_len)
+					eth_hdr(skb)->h_proto = skb->protocol;
+
+				/* all pointers could be changed now reset walk */
+				skb_abort_seq_read(skbseq);
+				skb_prepare_seq_read(skb, data, tail, skbseq);
+			} else if (skb->head_frag &&
+				   /* We have the IP header right now */
+				   remaining >= iphlen) {
+				fragwalk = &_fragwalk;
+				iptfs_skb_prepare_frag_walk(skb, data, fragwalk);
+				defer = skb;
+				skb = NULL;
+			} else {
+				/* We couldn't reuse the input skb so allocate a
+				 * new one.
+				 */
+				defer = skb;
+				skb = NULL;
+			}
+
+			/* Don't trim `first_skb` until the end as we are
+			 * walking that data now.
+			 */
+		}
+
+		capturelen = min(iplen, remaining);
+		if (!skb) {
+			if (!fragwalk ||
+			    /* Large enough to be worth sharing */
+			    iplen < IPTFS_PKT_SHARE_MIN ||
+			    /* Have IP header + some data to share. */
+			    capturelen <= iphlen ||
+			    /* Try creating skb and adding frags */
+			    !(skb = iptfs_pskb_add_frags(first_skb, fragwalk,
+							 data, capturelen,
+							 skbseq, iphlen))) {
+				skb = iptfs_pskb_extract_seq(iplen, skbseq, data, capturelen);
+			}
+			if (!skb) {
+				/* skip to next packet or done */
+				data += capturelen;
+				continue;
+			}
+
+			skb->protocol = protocol;
+			if (old_mac) {
+				/* rebuild the mac header */
+				skb_set_mac_header(skb, -first_skb->mac_len);
+				memcpy(skb_mac_header(skb), old_mac, first_skb->mac_len);
+				eth_hdr(skb)->h_proto = skb->protocol;
+			}
+		}
+
+		data += capturelen;
+
+		if (skb->len < iplen) {
+			/* Start reassembly */
+			spin_lock(&xtfs->drop_lock);
+
+			xtfs->ra_newskb = skb;
+			xtfs->ra_wantseq = seq + 1;
+			if (!hrtimer_is_queued(&xtfs->drop_timer)) {
+				/* softirq blocked lest the timer fire and interrupt us */
+				hrtimer_start(&xtfs->drop_timer,
+					      xtfs->drop_time_ns,
+					      IPTFS_HRTIMER_MODE);
+			}
+
+			spin_unlock(&xtfs->drop_lock);
+
+			break;
+		}
+
+		iptfs_complete_inner_skb(x, skb);
+		list_add_tail(&skb->list, sublist);
+	}
+
+	if (data != tail)
+		/* this should not happen from the above code */
+		XFRM_INC_STATS(net, LINUX_MIB_XFRMINIPTFSERROR);
+
+	if (first_skb && first_iplen && !defer && first_skb != xtfs->ra_newskb) {
+		/* first_skb is queued b/c !defer and not partial */
+		if (pskb_trim(first_skb, first_iplen)) {
+			/* error trimming */
+			list_del(&first_skb->list);
+			defer = first_skb;
+		}
+		first_skb->ip_summed = CHECKSUM_NONE;
+	}
+
+	/* Send the packets! */
+	list_for_each_entry_safe(skb, next, sublist, list) {
+		skb_list_del_init(skb);
+		if (xfrm_input(skb, 0, 0, -2))
+			kfree_skb(skb);
+	}
+done:
+	skb = skbseq->root_skb;
+	skb_abort_seq_read(skbseq);
+
+	if (defer) {
+		consume_skb(defer);
+	} else if (!first_skb) {
+		/* skb is the original passed in skb, but we didn't get far
+		 * enough to process it as the first_skb, if we had it would
+		 * either be save in ra_newskb, trimmed and sent on as an skb or
+		 * placed in defer to be freed.
+		 */
+		kfree_skb(skb);
+	}
+	return true;
+}
+
+/**
+ * iptfs_input_ordered() - handle next in order IPTFS payload.
+ * @x: xfrm state
+ * @skb: current packet
+ *
+ * Process the IPTFS payload in `skb` and consume it afterwards.
+ */
+static void iptfs_input_ordered(struct xfrm_state *x, struct sk_buff *skb)
+{
+	struct ip_iptfs_cc_hdr iptcch;
+	struct skb_seq_state skbseq;
+	struct list_head sublist; /* rename this it's just a list */
+	struct xfrm_iptfs_data *xtfs;
+	struct ip_iptfs_hdr *ipth;
+	struct net *net;
+	u32 blkoff, data, remaining;
+	bool consumed = false;
+	u64 seq;
+
+	xtfs = x->mode_data;
+	net = xs_net(x);
+
+	seq = __esp_seq(skb);
+
+	/* Large enough to hold both types of header */
+	ipth = (struct ip_iptfs_hdr *)&iptcch;
+
+	skb_prepare_seq_read(skb, 0, skb->len, &skbseq);
+
+	/* Get the IPTFS header and validate it */
+
+	if (skb_copy_seq_read(&skbseq, 0, ipth, sizeof(*ipth))) {
+		XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR);
+		goto done;
+	}
+	data = sizeof(*ipth);
+
+	trace_iptfs_egress_recv(skb, xtfs, be16_to_cpu(ipth->block_offset));
+
+	/* Set data past the basic header */
+	if (ipth->subtype == IPTFS_SUBTYPE_CC) {
+		/* Copy the rest of the CC header */
+		remaining = sizeof(iptcch) - sizeof(*ipth);
+		if (skb_copy_seq_read(&skbseq, data, ipth + 1, remaining)) {
+			XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR);
+			goto done;
+		}
+		data += remaining;
+	} else if (ipth->subtype != IPTFS_SUBTYPE_BASIC) {
+		XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR);
+		goto done;
+	}
+
+	if (ipth->flags != 0) {
+		XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR);
+		goto done;
+	}
+
+	INIT_LIST_HEAD(&sublist);
+
+	/* Handle fragment at start of payload, and/or waiting reassembly. */
+
+	blkoff = ntohs(ipth->block_offset);
+	/* check before locking i.e., maybe */
+	if (blkoff || xtfs->ra_runtlen || xtfs->ra_newskb) {
+		spin_lock(&xtfs->drop_lock);
+
+		/* check again after lock */
+		if (blkoff || xtfs->ra_runtlen || xtfs->ra_newskb) {
+			data = iptfs_reassem_cont(xtfs, seq, &skbseq, skb, data,
+						  blkoff, &sublist);
+		}
+
+		spin_unlock(&xtfs->drop_lock);
+	}
+
+	/* New packets */
+	consumed = __input_process_payload(x, data, &skbseq, &sublist);
+done:
+	if (!consumed) {
+		skb = skbseq.root_skb;
+		skb_abort_seq_read(&skbseq);
+		kfree_skb(skb);
+	}
+}
+
+/* ------------------------------- */
+/* Input (Egress) Re-ordering Code */
+/* ------------------------------- */
+
+static void __vec_shift(struct xfrm_iptfs_data *xtfs, u32 shift)
+{
+	u32 savedlen = xtfs->w_savedlen;
+
+	if (shift > savedlen)
+		shift = savedlen;
+	if (shift != savedlen)
+		memcpy(xtfs->w_saved, xtfs->w_saved + shift,
+		       (savedlen - shift) * sizeof(*xtfs->w_saved));
+	memset(xtfs->w_saved + savedlen - shift, 0,
+	       shift * sizeof(*xtfs->w_saved));
+	xtfs->w_savedlen -= shift;
+}
+
+static void __reorder_past(struct xfrm_iptfs_data *xtfs, struct sk_buff *inskb,
+			   struct list_head *freelist)
+{
+	list_add_tail(&inskb->list, freelist);
+}
+
+static u32 __reorder_drop(struct xfrm_iptfs_data *xtfs, struct list_head *list)
+
+{
+	struct skb_wseq *s, *se;
+	const u32 savedlen = xtfs->w_savedlen;
+	time64_t now = ktime_get_raw_fast_ns();
+	u32 count = 0;
+	u32 scount = 0;
+
+	if (xtfs->w_saved[0].drop_time > now)
+		goto set_timer;
+
+	++xtfs->w_wantseq;
+
+	/* Keep flushing packets until we reach a drop time greater than now. */
+	s = xtfs->w_saved;
+	se = s + savedlen;
+	do {
+		/* Walking past empty slots until we reach a packet */
+		for (; s < se && !s->skb; s++) {
+			if (s->drop_time > now)
+				goto outerdone;
+		}
+		/* Sending packets until we hit another empty slot. */
+		for (; s < se && s->skb; scount++, s++)
+			list_add_tail(&s->skb->list, list);
+	} while (s < se);
+outerdone:
+
+	count = s - xtfs->w_saved;
+	if (count) {
+		xtfs->w_wantseq += count;
+
+		/* Shift handled slots plus final empty slot into slot 0. */
+		__vec_shift(xtfs, count);
+	}
+
+	if (xtfs->w_savedlen) {
+set_timer:
+		/* Drifting is OK */
+		hrtimer_start(&xtfs->drop_timer,
+			      xtfs->w_saved[0].drop_time - now,
+			      IPTFS_HRTIMER_MODE);
+	}
+	return scount;
+}
+
+static void __reorder_this(struct xfrm_iptfs_data *xtfs, struct sk_buff *inskb,
+			   struct list_head *list)
+{
+	struct skb_wseq *s, *se;
+	const u32 savedlen = xtfs->w_savedlen;
+	u32 count = 0;
+
+	/* Got what we wanted. */
+	list_add_tail(&inskb->list, list);
+	++xtfs->w_wantseq;
+	if (!savedlen)
+		return;
+
+	/* Flush remaining consecutive packets. */
+
+	/* Keep sending until we hit another missed pkt. */
+	for (s = xtfs->w_saved, se = s + savedlen; s < se && s->skb; s++)
+		list_add_tail(&s->skb->list, list);
+	count = s - xtfs->w_saved;
+	if (count)
+		xtfs->w_wantseq += count;
+
+	/* Shift handled slots plus final empty slot into slot 0. */
+	__vec_shift(xtfs, count + 1);
+}
+
+/* Set the slot's drop time and all the empty slots below it until reaching a
+ * filled slot which will already be set.
+ */
+static void iptfs_set_window_drop_times(struct xfrm_iptfs_data *xtfs, int index)
+{
+	const u32 savedlen = xtfs->w_savedlen;
+	struct skb_wseq *s = xtfs->w_saved;
+	time64_t drop_time;
+
+	assert_spin_locked(&xtfs->drop_lock);
+
+	if (savedlen > index + 1) {
+		/* we are below another, our drop time and the timer are already set */
+		return;
+	}
+	/* we are the most future so get a new drop time. */
+	drop_time = ktime_get_raw_fast_ns();
+	drop_time += xtfs->drop_time_ns;
+
+	/* Walk back through the array setting drop times as we go */
+	s[index].drop_time = drop_time;
+	while (index-- > 0 && !s[index].skb)
+		s[index].drop_time = drop_time;
+
+	/* If we walked all the way back, schedule the drop timer if needed */
+	if (index == -1 && !hrtimer_is_queued(&xtfs->drop_timer))
+		hrtimer_start(&xtfs->drop_timer, xtfs->drop_time_ns,
+			      IPTFS_HRTIMER_MODE);
+}
+
+static void __reorder_future_fits(struct xfrm_iptfs_data *xtfs,
+				  struct sk_buff *inskb,
+				  struct list_head *freelist)
+{
+	const u64 inseq = __esp_seq(inskb);
+	const u64 wantseq = xtfs->w_wantseq;
+	const u64 distance = inseq - wantseq;
+	const u32 savedlen = xtfs->w_savedlen;
+	const u32 index = distance - 1;
+
+	/* Handle future sequence number received which fits in the window.
+	 *
+	 * We know we don't have the seq we want so we won't be able to flush
+	 * anything.
+	 */
+
+	/* slot count is 4, saved size is 3 savedlen is 2
+	 *
+	 * "window boundary" is based on the fixed window size
+	 * distance is also slot number
+	 * index is an array index (i.e., - 1 of slot)
+	 * : : - implicit NULL after array len
+	 *
+	 *          +--------- used length (savedlen == 2)
+	 *          |   +----- array size (nslots - 1 == 3)
+	 *          |   |   + window boundary (nslots == 4)
+	 *          V   V | V
+	 *                |
+	 *  0   1   2   3 |   slot number
+	 * ---  0   1   2 |   array index
+	 *     [-] [b] : :|   array
+	 *
+	 * "2" "3" "4" *5*|   seq numbers
+	 *
+	 * We receive seq number 5
+	 * distance == 3 [inseq(5) - w_wantseq(2)]
+	 * index == 2 [distance(6) - 1]
+	 */
+
+	if (xtfs->w_saved[index].skb) {
+		/* a dup of a future */
+		list_add_tail(&inskb->list, freelist);
+		return;
+	}
+
+	xtfs->w_saved[index].skb = inskb;
+	xtfs->w_savedlen = max(savedlen, index + 1);
+	iptfs_set_window_drop_times(xtfs, index);
+}
+
+static void __reorder_future_shifts(struct xfrm_iptfs_data *xtfs,
+				    struct sk_buff *inskb,
+				    struct list_head *list)
+{
+	const u32 nslots = xtfs->cfg.reorder_win_size + 1;
+	const u64 inseq = __esp_seq(inskb);
+	u32 savedlen = xtfs->w_savedlen;
+	u64 wantseq = xtfs->w_wantseq;
+	struct skb_wseq *wnext;
+	struct sk_buff *slot0;
+	u32 beyond, shifting, slot;
+	u64 distance;
+
+	/* Handle future sequence number received.
+	 *
+	 * IMPORTANT: we are at least advancing w_wantseq (i.e., wantseq) by 1
+	 * b/c we are beyond the window boundary.
+	 *
+	 * We know we don't have the wantseq so that counts as a drop.
+	 */
+
+	/* example: slot count is 4, array size is 3 savedlen is 2, slot 0 is
+	 * the missing sequence number.
+	 *
+	 * the final slot at savedlen (index savedlen - 1) is always occupied.
+	 *
+	 * beyond is "beyond array size" not savedlen.
+	 *
+	 *          +--------- array length (savedlen == 2)
+	 *          |   +----- array size (nslots - 1 == 3)
+	 *          |   | +- window boundary (nslots == 4)
+	 *          V   V |
+	 *                |
+	 *  0   1   2   3 |   slot number
+	 * ---  0   1   2 |   array index
+	 *     [b] [c] : :|   array
+	 *                |
+	 * "2" "3" "4" "5"|*6*  seq numbers
+	 *
+	 * We receive seq number 6
+	 * distance == 4 [inseq(6) - w_wantseq(2)]
+	 * newslot == distance
+	 * index == 3 [distance(4) - 1]
+	 * beyond == 1 [newslot(4) - lastslot((nslots(4) - 1))]
+	 * shifting == 1 [min(savedlen(2), beyond(1)]
+	 * slot0_skb == [b], and should match w_wantseq
+	 *
+	 *                +--- window boundary (nslots == 4)
+	 *  0   1   2   3 | 4   slot number
+	 * ---  0   1   2 | 3   array index
+	 *     [b] : : : :|     array
+	 * "2" "3" "4" "5" *6*  seq numbers
+	 *
+	 * We receive seq number 6
+	 * distance == 4 [inseq(6) - w_wantseq(2)]
+	 * newslot == distance
+	 * index == 3 [distance(4) - 1]
+	 * beyond == 1 [newslot(4) - lastslot((nslots(4) - 1))]
+	 * shifting == 1 [min(savedlen(1), beyond(1)]
+	 * slot0_skb == [b] and should match w_wantseq
+	 *
+	 *                +-- window boundary (nslots == 4)
+	 *  0   1   2   3 | 4   5   6   slot number
+	 * ---  0   1   2 | 3   4   5   array index
+	 *     [-] [c] : :|             array
+	 * "2" "3" "4" "5" "6" "7" *8*  seq numbers
+	 *
+	 * savedlen = 2, beyond = 3
+	 * iter 1: slot0 == NULL, missed++, lastdrop = 2 (2+1-1), slot0 = [-]
+	 * iter 2: slot0 == NULL, missed++, lastdrop = 3 (2+2-1), slot0 = [c]
+	 * 2 < 3, extra = 1 (3-2), missed += extra, lastdrop = 4 (2+2+1-1)
+	 *
+	 * We receive seq number 8
+	 * distance == 6 [inseq(8) - w_wantseq(2)]
+	 * newslot == distance
+	 * index == 5 [distance(6) - 1]
+	 * beyond == 3 [newslot(6) - lastslot((nslots(4) - 1))]
+	 * shifting == 2 [min(savedlen(2), beyond(3)]
+	 *
+	 * slot0_skb == NULL changed from [b] when "savedlen < beyond" is true.
+	 */
+
+	/* Now send any packets that are being shifted out of saved, and account
+	 * for missing packets that are exiting the window as we shift it.
+	 */
+
+	distance = inseq - wantseq;
+	beyond = distance - (nslots - 1);
+
+	/* If savedlen > beyond we are shifting some, else all. */
+	shifting = min(savedlen, beyond);
+
+	/* slot0 is the buf that just shifted out and into slot0 */
+	slot0 = NULL;
+	wnext = xtfs->w_saved;
+	for (slot = 1; slot <= shifting; slot++, wnext++) {
+		/* handle what was in slot0 before we occupy it */
+		if (slot0)
+			list_add_tail(&slot0->list, list);
+		slot0 = wnext->skb;
+		wnext->skb = NULL;
+	}
+
+	/* slot0 is now either NULL (in which case it's what we now are waiting
+	 * for, or a buf in which case we need to handle it like we received it;
+	 * however, we may be advancing past that buffer as well..
+	 */
+
+	/* Handle case where we need to shift more than we had saved, slot0 will
+	 * be NULL iff savedlen is 0, otherwise slot0 will always be
+	 * non-NULL b/c we shifted the final element, which is always set if
+	 * there is any saved, into slot0.
+	 */
+	if (savedlen < beyond) {
+		if (savedlen != 0)
+			list_add_tail(&slot0->list, list);
+		slot0 = NULL;
+		/* slot0 has had an empty slot pushed into it */
+	}
+
+	/* Remove the entries */
+	__vec_shift(xtfs, beyond);
+
+	/* Advance want seq */
+	xtfs->w_wantseq += beyond;
+
+	/* Process drops here when implementing congestion control */
+
+	/* We've shifted. plug the packet in at the end. */
+	xtfs->w_savedlen = nslots - 1;
+	xtfs->w_saved[xtfs->w_savedlen - 1].skb = inskb;
+	iptfs_set_window_drop_times(xtfs, xtfs->w_savedlen - 1);
+
+	/* if we don't have a slot0 then we must wait for it */
+	if (!slot0)
+		return;
+
+	/* If slot0, seq must match new want seq */
+
+	/* slot0 is valid, treat like we received expected. */
+	__reorder_this(xtfs, slot0, list);
+}
+
+/* Receive a new packet into the reorder window. Return a list of ordered
+ * packets from the window.
+ */
+static void iptfs_input_reorder(struct xfrm_iptfs_data *xtfs,
+				struct sk_buff *inskb, struct list_head *list,
+				struct list_head *freelist)
+{
+	const u32 nslots = xtfs->cfg.reorder_win_size + 1;
+	u64 inseq = __esp_seq(inskb);
+	u64 wantseq;
+
+	assert_spin_locked(&xtfs->drop_lock);
+
+	if (unlikely(!xtfs->w_seq_set)) {
+		xtfs->w_seq_set = true;
+		xtfs->w_wantseq = inseq;
+	}
+	wantseq = xtfs->w_wantseq;
+
+	if (likely(inseq == wantseq))
+		__reorder_this(xtfs, inskb, list);
+	else if (inseq < wantseq)
+		__reorder_past(xtfs, inskb, freelist);
+	else if ((inseq - wantseq) < nslots)
+		__reorder_future_fits(xtfs, inskb, freelist);
+	else
+		__reorder_future_shifts(xtfs, inskb, list);
+}
+
+/**
+ * iptfs_drop_timer() - Handle drop timer expiry.
+ * @me: the timer
+ *
+ * This is similar to our input function.
+ *
+ * The drop timer is set when we start an in progress reassembly, and also when
+ * we save a future packet in the window saved array.
+ *
+ * NOTE packets in the save window are always newer WRT drop times as
+ * they get further in the future. i.e. for:
+ *
+ *    if slots (S0, S1, ... Sn) and `Dn` is the drop time for slot `Sn`,
+ *    then D(n-1) <= D(n).
+ *
+ * So, regardless of why the timer is firing we can always discard any inprogress
+ * fragment; either it's the reassembly timer, or slot 0 is going to be
+ * dropped as S0 must have the most recent drop time, and slot 0 holds the
+ * continuation fragment of the in progress packet.
+ *
+ * Returns HRTIMER_NORESTART.
+ */
+static enum hrtimer_restart iptfs_drop_timer(struct hrtimer *me)
+{
+	struct sk_buff *skb, *next;
+	struct list_head list;
+	struct xfrm_iptfs_data *xtfs;
+	struct xfrm_state *x;
+	u32 count;
+
+	xtfs = container_of(me, typeof(*xtfs), drop_timer);
+	x = xtfs->x;
+
+	INIT_LIST_HEAD(&list);
+
+	spin_lock(&xtfs->drop_lock);
+
+	/* Drop any in progress packet */
+	skb = xtfs->ra_newskb;
+	xtfs->ra_newskb = NULL;
+
+	/* Now drop as many packets as we should from the reordering window
+	 * saved array
+	 */
+	count = xtfs->w_savedlen ? __reorder_drop(xtfs, &list) : 0;
+
+	spin_unlock(&xtfs->drop_lock);
+
+	if (skb)
+		kfree_skb_reason(skb, SKB_DROP_REASON_FRAG_REASM_TIMEOUT);
+
+	if (count) {
+		list_for_each_entry_safe(skb, next, &list, list) {
+			skb_list_del_init(skb);
+			iptfs_input_ordered(x, skb);
+		}
+	}
+
+	return HRTIMER_NORESTART;
+}
+
+/**
+ * iptfs_input() - handle receipt of iptfs payload
+ * @x: xfrm state
+ * @skb: the packet
+ *
+ * We have an IPTFS payload order it if needed, then process newly in order
+ * packets.
+ *
+ * Return: -EINPROGRESS to inform xfrm_input to stop processing the skb.
+ */
+static int iptfs_input(struct xfrm_state *x, struct sk_buff *skb)
+{
+	struct list_head freelist, list;
+	struct xfrm_iptfs_data *xtfs = x->mode_data;
+	struct sk_buff *next;
+
+	/* Fast path for no reorder window. */
+	if (xtfs->cfg.reorder_win_size == 0) {
+		iptfs_input_ordered(x, skb);
+		goto done;
+	}
+
+	/* Fetch list of in-order packets from the reordering window as well as
+	 * a list of buffers we need to now free.
+	 */
+	INIT_LIST_HEAD(&list);
+	INIT_LIST_HEAD(&freelist);
+
+	spin_lock(&xtfs->drop_lock);
+	iptfs_input_reorder(xtfs, skb, &list, &freelist);
+	spin_unlock(&xtfs->drop_lock);
+
+	list_for_each_entry_safe(skb, next, &list, list) {
+		skb_list_del_init(skb);
+		iptfs_input_ordered(x, skb);
+	}
+
+	list_for_each_entry_safe(skb, next, &freelist, list) {
+		skb_list_del_init(skb);
+		kfree_skb(skb);
+	}
+done:
+	/* We always have dealt with the input SKB, either we are re-using it,
+	 * or we have freed it. Return EINPROGRESS so that xfrm_input stops
+	 * processing it.
+	 */
+	return -EINPROGRESS;
+}
+
+/* ================================= */
+/* IPTFS Sending (ingress) Functions */
+/* ================================= */
+
+/* ------------------------- */
+/* Enqueue to send functions */
+/* ------------------------- */
+
+/**
+ * iptfs_enqueue() - enqueue packet if ok to send.
+ * @xtfs: xtfs state
+ * @skb: the packet
+ *
+ * Return: true if packet enqueued.
+ */
+static bool iptfs_enqueue(struct xfrm_iptfs_data *xtfs, struct sk_buff *skb)
+{
+	u64 newsz = xtfs->queue_size + skb->len;
+	struct iphdr *iph;
+
+	assert_spin_locked(&xtfs->x->lock);
+
+	if (newsz > xtfs->cfg.max_queue_size)
+		return false;
+
+	/* Set ECN CE if we are above our ECN queue threshold */
+	if (newsz > xtfs->ecn_queue_size) {
+		iph = ip_hdr(skb);
+		if (iph->version == 4)
+			IP_ECN_set_ce(iph);
+		else if (iph->version == 6)
+			IP6_ECN_set_ce(skb, ipv6_hdr(skb));
+	}
+
+	__skb_queue_tail(&xtfs->queue, skb);
+	xtfs->queue_size += skb->len;
+	return true;
+}
+
+static int iptfs_get_cur_pmtu(struct xfrm_state *x, struct xfrm_iptfs_data *xtfs,
+			      struct sk_buff *skb)
+{
+	struct xfrm_dst *xdst = (struct xfrm_dst *)skb_dst(skb);
+	u32 payload_mtu = xtfs->payload_mtu;
+	u32 pmtu = __iptfs_get_inner_mtu(x, xdst->child_mtu_cached);
+
+	if (payload_mtu && payload_mtu < pmtu)
+		pmtu = payload_mtu;
+
+	return pmtu;
+}
+
+static int iptfs_is_too_big(struct sock *sk, struct sk_buff *skb, u32 pmtu)
+{
+	if (skb->len <= pmtu)
+		return 0;
+
+	/* We only send ICMP too big if the user has configured us as
+	 * dont-fragment.
+	 */
+	if (skb->dev)
+		XFRM_INC_STATS(dev_net(skb->dev), LINUX_MIB_XFRMOUTERROR);
+
+	if (sk)
+		xfrm_local_error(skb, pmtu);
+	else if (ip_hdr(skb)->version == 4)
+		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(pmtu));
+	else
+		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, pmtu);
+
+	return 1;
+}
+
+/* IPv4/IPv6 packet ingress to IPTFS tunnel, arrange to send in IPTFS payload
+ * (i.e., aggregating or fragmenting as appropriate).
+ * This is set in dst->output for an SA.
+ */
+static int iptfs_output_collect(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+	struct dst_entry *dst = skb_dst(skb);
+	struct xfrm_state *x = dst->xfrm;
+	struct xfrm_iptfs_data *xtfs = x->mode_data;
+	struct sk_buff *segs, *nskb;
+	u32 pmtu = 0;
+	bool ok = true;
+	bool was_gso;
+
+	/* We have hooked into dst_entry->output which means we have skipped the
+	 * protocol specific netfilter (see xfrm4_output, xfrm6_output).
+	 * when our timer runs we will end up calling xfrm_output directly on
+	 * the encapsulated traffic.
+	 *
+	 * For both cases this is the NF_INET_POST_ROUTING hook which allows
+	 * changing the skb->dst entry which then may not be xfrm based anymore
+	 * in which case a REROUTED flag is set. and dst_output is called.
+	 *
+	 * For IPv6 we are also skipping fragmentation handling for local
+	 * sockets, which may or may not be good depending on our tunnel DF
+	 * setting. Normally with fragmentation supported we want to skip this
+	 * fragmentation.
+	 */
+
+	if (xtfs->cfg.dont_frag)
+		pmtu = iptfs_get_cur_pmtu(x, xtfs, skb);
+
+	/* Break apart GSO skbs. If the queue is nearing full then we want the
+	 * accounting and queuing to be based on the individual packets not on the
+	 * aggregate GSO buffer.
+	 */
+	was_gso = skb_is_gso(skb);
+	if (!was_gso) {
+		segs = skb;
+	} else {
+		segs = skb_gso_segment(skb, 0);
+		if (IS_ERR_OR_NULL(segs)) {
+			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTERROR);
+			kfree_skb(skb);
+			if (IS_ERR(segs))
+				return PTR_ERR(segs);
+			return -EINVAL;
+		}
+		consume_skb(skb);
+		skb = NULL;
+	}
+
+	/* We can be running on multiple cores and from the network softirq or
+	 * from user context depending on where the packet is coming from.
+	 */
+	spin_lock_bh(&x->lock);
+
+	skb_list_walk_safe(segs, skb, nskb) {
+		skb_mark_not_on_list(skb);
+
+		/* Once we drop due to no queue space we continue to drop the
+		 * rest of the packets from that GRO.
+		 */
+		if (!ok) {
+nospace:
+			trace_iptfs_no_queue_space(skb, xtfs, pmtu, was_gso);
+			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOQSPACE);
+			kfree_skb_reason(skb, SKB_DROP_REASON_FULL_RING);
+			continue;
+		}
+
+		/* If the user indicated no iptfs fragmenting check before
+		 * enqueue.
+		 */
+		if (xtfs->cfg.dont_frag && iptfs_is_too_big(sk, skb, pmtu)) {
+			trace_iptfs_too_big(skb, xtfs, pmtu, was_gso);
+			kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
+			continue;
+		}
+
+		/* Enqueue to send in tunnel */
+		ok = iptfs_enqueue(xtfs, skb);
+		if (!ok)
+			goto nospace;
+
+		trace_iptfs_enqueue(skb, xtfs, pmtu, was_gso);
+	}
+
+	/* Start a delay timer if we don't have one yet */
+	if (!hrtimer_is_queued(&xtfs->iptfs_timer)) {
+		hrtimer_start(&xtfs->iptfs_timer, xtfs->init_delay_ns, IPTFS_HRTIMER_MODE);
+		xtfs->iptfs_settime = ktime_get_raw_fast_ns();
+		trace_iptfs_timer_start(xtfs, xtfs->init_delay_ns);
+	}
+
+	spin_unlock_bh(&x->lock);
+	return 0;
+}
+
+/* -------------------------- */
+/* Dequeue and send functions */
+/* -------------------------- */
+
+static void iptfs_output_prepare_skb(struct sk_buff *skb, u32 blkoff)
+{
+	struct ip_iptfs_hdr *h;
+	size_t hsz = sizeof(*h);
+
+	/* now reset values to be pointing at the rest of the packets */
+	h = skb_push(skb, hsz);
+	memset(h, 0, hsz);
+	if (blkoff)
+		h->block_offset = htons(blkoff);
+
+	/* network_header current points at the inner IP packet
+	 * move it to the iptfs header
+	 */
+	skb->transport_header = skb->network_header;
+	skb->network_header -= hsz;
+
+	IPCB(skb)->flags |= IPSKB_XFRM_TUNNEL_SIZE;
+}
+
+/**
+ * iptfs_copy_create_frag() - create an inner fragment skb.
+ * @st: The source packet data.
+ * @offset: offset in @st of the new fragment data.
+ * @copy_len: the amount of data to copy from @st.
+ *
+ * Create a new skb holding a single IPTFS inner packet fragment. @copy_len must
+ * not be greater than the max fragment size.
+ *
+ * Return: the new fragment skb or an ERR_PTR().
+ */
+static struct sk_buff *iptfs_copy_create_frag(struct skb_seq_state *st, u32 offset, u32 copy_len)
+{
+	struct sk_buff *src = st->root_skb;
+	struct sk_buff *skb;
+	int err;
+
+	skb = iptfs_alloc_skb(src, copy_len, true);
+	if (!skb)
+		return ERR_PTR(-ENOMEM);
+
+	/* Now copy `copy_len` data from src */
+	err = skb_copy_seq_read(st, offset, skb_put(skb, copy_len), copy_len);
+	if (err) {
+		kfree_skb(skb);
+		return ERR_PTR(err);
+	}
+
+	return skb;
+}
+
+/**
+ * iptfs_copy_create_frags() - create and send N-1 fragments of a larger skb.
+ * @skbp: the source packet skb (IN), skb holding the last fragment in
+ *        the fragment stream (OUT).
+ * @xtfs: IPTFS SA state.
+ * @mtu: the max IPTFS fragment size.
+ *
+ * This function is responsible for fragmenting a larger inner packet into a
+ * sequence of IPTFS payload packets. The last fragment is returned rather than
+ * being sent so that the caller can append more inner packets (aggregation) if
+ * there is room.
+ *
+ * Return: 0 on success or a negative error code on failure
+ */
+static int iptfs_copy_create_frags(struct sk_buff **skbp, struct xfrm_iptfs_data *xtfs, u32 mtu)
+{
+	struct skb_seq_state skbseq;
+	struct list_head sublist;
+	struct sk_buff *skb = *skbp;
+	struct sk_buff *nskb = *skbp;
+	u32 copy_len, offset;
+	u32 to_copy = skb->len - mtu;
+	u32 blkoff = 0;
+	int err = 0;
+
+	INIT_LIST_HEAD(&sublist);
+
+	skb_prepare_seq_read(skb, 0, skb->len, &skbseq);
+
+	/* A trimmed `skb` will be sent as the first fragment, later. */
+	offset = mtu;
+	to_copy = skb->len - offset;
+	while (to_copy) {
+		/* Send all but last fragment to allow agg. append */
+		trace_iptfs_first_fragmenting(nskb, mtu, to_copy, NULL);
+		list_add_tail(&nskb->list, &sublist);
+
+		/* FUTURE: if the packet has an odd/non-aligning length we could
+		 * send less data in the penultimate fragment so that the last
+		 * fragment then ends on an aligned boundary.
+		 */
+		copy_len = min(to_copy, mtu);
+		nskb = iptfs_copy_create_frag(&skbseq, offset, copy_len);
+		if (IS_ERR(nskb)) {
+			XFRM_INC_STATS(xs_net(xtfs->x), LINUX_MIB_XFRMOUTERROR);
+			skb_abort_seq_read(&skbseq);
+			err = PTR_ERR(nskb);
+			nskb = NULL;
+			break;
+		}
+		iptfs_output_prepare_skb(nskb, to_copy);
+		offset += copy_len;
+		to_copy -= copy_len;
+		blkoff = to_copy;
+	}
+	skb_abort_seq_read(&skbseq);
+
+	/* return last fragment that will be unsent (or NULL) */
+	*skbp = nskb;
+	if (nskb)
+		trace_iptfs_first_final_fragment(nskb, mtu, blkoff, NULL);
+
+	/* trim the original skb to MTU */
+	if (!err)
+		err = pskb_trim(skb, mtu);
+
+	if (err) {
+		/* Free all frags. Don't bother sending a partial packet we will
+		 * never complete.
+		 */
+		kfree_skb(nskb);
+		list_for_each_entry_safe(skb, nskb, &sublist, list) {
+			skb_list_del_init(skb);
+			kfree_skb(skb);
+		}
+		return err;
+	}
+
+	/* prepare the initial fragment with an iptfs header */
+	iptfs_output_prepare_skb(skb, 0);
+
+	/* Send all but last fragment, if we fail to send a fragment then free
+	 * the rest -- no point in sending a packet that can't be reassembled.
+	 */
+	list_for_each_entry_safe(skb, nskb, &sublist, list) {
+		skb_list_del_init(skb);
+		if (!err)
+			err = xfrm_output(NULL, skb);
+		else
+			kfree_skb(skb);
+	}
+	if (err)
+		kfree_skb(*skbp);
+	return err;
+}
+
+/**
+ * iptfs_first_skb() - handle the first dequeued inner packet for output
+ * @skbp: the source packet skb (IN), skb holding the last fragment in
+ *        the fragment stream (OUT).
+ * @xtfs: IPTFS SA state.
+ * @mtu: the max IPTFS fragment size.
+ *
+ * This function is responsible for fragmenting a larger inner packet into a
+ * sequence of IPTFS payload packets.
+ *
+ * The last fragment is returned rather than being sent so that the caller can
+ * append more inner packets (aggregation) if there is room.
+ *
+ * Return: 0 on success or a negative error code on failure
+ */
+static int iptfs_first_skb(struct sk_buff **skbp, struct xfrm_iptfs_data *xtfs, u32 mtu)
+{
+	struct sk_buff *skb = *skbp;
+	int err;
+
+	/* Classic ESP skips the don't fragment ICMP error if DF is clear on
+	 * the inner packet or ignore_df is set. Otherwise it will send an ICMP
+	 * or local error if the inner packet won't fit it's MTU.
+	 *
+	 * With IPTFS we do not care about the inner packet DF bit. If the
+	 * tunnel is configured to "don't fragment" we error back if things
+	 * don't fit in our max packet size. Otherwise we iptfs-fragment as
+	 * normal.
+	 */
+
+	/* The opportunity for HW offload has ended */
+	if (skb->ip_summed == CHECKSUM_PARTIAL) {
+		err = skb_checksum_help(skb);
+		if (err)
+			return err;
+	}
+
+	/* We've split gso up before queuing */
+
+	trace_iptfs_first_dequeue(skb, mtu, 0, ip_hdr(skb));
+
+	/* Consider the buffer Tx'd and no longer owned */
+	skb_orphan(skb);
+
+	/* Simple case -- it fits. `mtu` accounted for all the overhead
+	 * including the basic IPTFS header.
+	 */
+	if (skb->len <= mtu) {
+		iptfs_output_prepare_skb(skb, 0);
+		return 0;
+	}
+
+	return iptfs_copy_create_frags(skbp, xtfs, mtu);
+}
+
+static struct sk_buff **iptfs_rehome_fraglist(struct sk_buff **nextp, struct sk_buff *child)
+{
+	u32 fllen = 0;
+
+	/* It might be possible to account for a frag list in addition to page
+	 * fragment if it's a valid state to be in. The page fragments size
+	 * should be kept as data_len so only the frag_list size is removed,
+	 * this must be done above as well.
+	 */
+	*nextp = skb_shinfo(child)->frag_list;
+	while (*nextp) {
+		fllen += (*nextp)->len;
+		nextp = &(*nextp)->next;
+	}
+	skb_frag_list_init(child);
+	child->len -= fllen;
+	child->data_len -= fllen;
+
+	return nextp;
+}
+
+static void iptfs_consume_frags(struct sk_buff *to, struct sk_buff *from)
+{
+	struct skb_shared_info *fromi = skb_shinfo(from);
+	struct skb_shared_info *toi = skb_shinfo(to);
+	unsigned int new_truesize;
+
+	/* If we have data in a head page, grab it */
+	if (!skb_headlen(from)) {
+		new_truesize = SKB_TRUESIZE(skb_end_offset(from));
+	} else {
+		iptfs_skb_head_to_frag(from, &toi->frags[toi->nr_frags]);
+		skb_frag_ref(to, toi->nr_frags++);
+		new_truesize = SKB_DATA_ALIGN(sizeof(struct sk_buff));
+	}
+
+	/* Move any other page fragments rather than copy */
+	memcpy(&toi->frags[toi->nr_frags], fromi->frags,
+	       sizeof(fromi->frags[0]) * fromi->nr_frags);
+	toi->nr_frags += fromi->nr_frags;
+	fromi->nr_frags = 0;
+	from->data_len = 0;
+	from->len = 0;
+	to->truesize += from->truesize - new_truesize;
+	from->truesize = new_truesize;
+
+	/* We are done with this SKB */
+	consume_skb(from);
+}
+
+static void iptfs_output_queued(struct xfrm_state *x, struct sk_buff_head *list)
+{
+	struct xfrm_iptfs_data *xtfs = x->mode_data;
+	struct sk_buff *skb, *skb2, **nextp;
+	struct skb_shared_info *shi, *shi2;
+
+	/* If we are fragmenting due to a large inner packet we will output all
+	 * the outer IPTFS packets required to contain the fragments of the
+	 * single large inner packet. These outer packets need to be sent
+	 * consecutively (ESP seq-wise). Since this output function is always
+	 * running from a timer we do not need a lock to provide this guarantee.
+	 * We will output our packets consecutively before the timer is allowed
+	 * to run again on some other CPU.
+	 */
+
+	while ((skb = __skb_dequeue(list))) {
+		u32 mtu = iptfs_get_cur_pmtu(x, xtfs, skb);
+		bool share_ok = true;
+		int remaining;
+
+		/* protocol comes to us cleared sometimes */
+		skb->protocol = x->outer_mode.family == AF_INET ? htons(ETH_P_IP) :
+								  htons(ETH_P_IPV6);
+
+		if (skb->len > mtu && xtfs->cfg.dont_frag) {
+			/* We handle this case before enqueueing so we are only
+			 * here b/c MTU changed after we enqueued before we
+			 * dequeued, just drop these.
+			 */
+			XFRM_INC_STATS(xs_net(x), LINUX_MIB_XFRMOUTERROR);
+
+			trace_iptfs_first_toobig(skb, mtu, 0, ip_hdr(skb));
+			kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
+			continue;
+		}
+
+		/* Convert first inner packet into an outer IPTFS packet,
+		 * dealing with any fragmentation into multiple outer packets
+		 * if necessary.
+		 */
+		if (iptfs_first_skb(&skb, xtfs, mtu))
+			continue;
+
+		/* If fragmentation was required the returned skb is the last
+		 * IPTFS fragment in the chain, and it's IPTFS header blkoff has
+		 * been set just past the end of the fragment data.
+		 *
+		 * In either case the space remaining to send more inner packet
+		 * data is `mtu` - (skb->len - sizeof iptfs header). This is b/c
+		 * the `mtu` value has the basic IPTFS header len accounted for,
+		 * and we added that header to the skb so it is a part of
+		 * skb->len, thus we subtract it from the skb length.
+		 */
+		remaining = mtu - (skb->len - sizeof(struct ip_iptfs_hdr));
+
+		/* Re-home (un-nest) nested fragment lists. We need to do this
+		 * b/c we will simply be appending any following aggregated
+		 * inner packets using the frag list.
+		 */
+		shi = skb_shinfo(skb);
+		nextp = &shi->frag_list;
+		while (*nextp) {
+			if (skb_has_frag_list(*nextp))
+				nextp = iptfs_rehome_fraglist(&(*nextp)->next, *nextp);
+			else
+				nextp = &(*nextp)->next;
+		}
+
+		if (shi->frag_list || skb_cloned(skb) || skb_shared(skb))
+			share_ok = false;
+
+		/* See if we have enough space to simply append.
+		 *
+		 * NOTE: Maybe do not append if we will be mis-aligned,
+		 * SW-based endpoints will probably have to copy in this
+		 * case.
+		 */
+		while ((skb2 = skb_peek(list))) {
+			trace_iptfs_ingress_nth_peek(skb2, remaining);
+			if (skb2->len > remaining)
+				break;
+
+			__skb_unlink(skb2, list);
+
+			/* Consider the buffer Tx'd and no longer owned */
+			skb_orphan(skb);
+
+			/* If we don't have a cksum in the packet we need to add
+			 * one before encapsulation.
+			 */
+			if (skb2->ip_summed == CHECKSUM_PARTIAL) {
+				if (skb_checksum_help(skb2)) {
+					XFRM_INC_STATS(xs_net(x), LINUX_MIB_XFRMOUTERROR);
+					kfree_skb(skb2);
+					continue;
+				}
+			}
+
+			/* skb->pp_recycle is passed to __skb_flag_unref for all
+			 * frag pages so we can only share pages with skb's who
+			 * match ourselves.
+			 */
+			shi2 = skb_shinfo(skb2);
+			if (share_ok &&
+			    (shi2->frag_list ||
+			     (!skb2->head_frag && skb_headlen(skb)) ||
+			     skb->pp_recycle != skb2->pp_recycle ||
+			     skb_zcopy(skb2) ||
+			     (shi->nr_frags + shi2->nr_frags + 1 > MAX_SKB_FRAGS)))
+				share_ok = false;
+
+			/* Do accounting */
+			skb->data_len += skb2->len;
+			skb->len += skb2->len;
+			remaining -= skb2->len;
+
+			trace_iptfs_ingress_nth_add(skb2, share_ok);
+
+			if (share_ok) {
+				iptfs_consume_frags(skb, skb2);
+			} else {
+				/* Append to the frag_list */
+				*nextp = skb2;
+				nextp = &skb2->next;
+				if (skb_has_frag_list(skb2))
+					nextp = iptfs_rehome_fraglist(nextp,
+								      skb2);
+				skb->truesize += skb2->truesize;
+			}
+		}
+
+		xfrm_output(NULL, skb);
+	}
+}
+
+static enum hrtimer_restart iptfs_delay_timer(struct hrtimer *me)
+{
+	struct sk_buff_head list;
+	struct xfrm_iptfs_data *xtfs;
+	struct xfrm_state *x;
+	time64_t settime;
+
+	xtfs = container_of(me, typeof(*xtfs), iptfs_timer);
+	x = xtfs->x;
+
+	/* Process all the queued packets
+	 *
+	 * softirq execution order: timer > tasklet > hrtimer
+	 *
+	 * Network rx will have run before us giving one last chance to queue
+	 * ingress packets for us to process and transmit.
+	 */
+
+	spin_lock(&x->lock);
+	__skb_queue_head_init(&list);
+	skb_queue_splice_init(&xtfs->queue, &list);
+	xtfs->queue_size = 0;
+	settime = xtfs->iptfs_settime;
+	spin_unlock(&x->lock);
+
+	/* After the above unlock, packets can begin queuing again, and the
+	 * timer can be set again, from another CPU either in softirq or user
+	 * context (not from this one since we are running at softirq level
+	 * already).
+	 */
+
+	trace_iptfs_timer_expire(xtfs, (unsigned long long)(ktime_get_raw_fast_ns() - settime));
+
+	iptfs_output_queued(x, &list);
+
+	return HRTIMER_NORESTART;
+}
+
+/**
+ * iptfs_encap_add_ipv4() - add outer encaps
+ * @x: xfrm state
+ * @skb: the packet
+ *
+ * This was originally taken from xfrm4_tunnel_encap_add. The reason for the
+ * copy is that IP-TFS/AGGFRAG can have different functionality for how to set
+ * the TOS/DSCP bits. Sets the protocol to a different value and doesn't do
+ * anything with inner headers as they aren't pointing into a normal IP
+ * singleton inner packet.
+ *
+ * Return: 0 on success or a negative error code on failure
+ */
+static int iptfs_encap_add_ipv4(struct xfrm_state *x, struct sk_buff *skb)
+{
+	struct dst_entry *dst = skb_dst(skb);
+	struct iphdr *top_iph;
+
+	skb_reset_inner_network_header(skb);
+	skb_reset_inner_transport_header(skb);
+
+	skb_set_network_header(skb, -(x->props.header_len - x->props.enc_hdr_len));
+	skb->mac_header = skb->network_header + offsetof(struct iphdr, protocol);
+	skb->transport_header = skb->network_header + sizeof(*top_iph);
+
+	top_iph = ip_hdr(skb);
+	top_iph->ihl = 5;
+	top_iph->version = 4;
+	top_iph->protocol = IPPROTO_AGGFRAG;
+
+	/* As we have 0, fractional, 1 or N inner packets there's no obviously
+	 * correct DSCP mapping to inherit. ECN should be cleared per RFC9347
+	 * 3.1.
+	 */
+	top_iph->tos = 0;
+
+	top_iph->frag_off = htons(IP_DF);
+	top_iph->ttl = ip4_dst_hoplimit(xfrm_dst_child(dst));
+	top_iph->saddr = x->props.saddr.a4;
+	top_iph->daddr = x->id.daddr.a4;
+	ip_select_ident(dev_net(dst->dev), skb, NULL);
+
+	return 0;
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+/**
+ * iptfs_encap_add_ipv6() - add outer encaps
+ * @x: xfrm state
+ * @skb: the packet
+ *
+ * This was originally taken from xfrm6_tunnel_encap_add. The reason for the
+ * copy is that IP-TFS/AGGFRAG can have different functionality for how to set
+ * the flow label and TOS/DSCP bits. It also sets the protocol to a different
+ * value and doesn't do anything with inner headers as they aren't pointing into
+ * a normal IP singleton inner packet.
+ *
+ * Return: 0 on success or a negative error code on failure
+ */
+static int iptfs_encap_add_ipv6(struct xfrm_state *x, struct sk_buff *skb)
+{
+	struct dst_entry *dst = skb_dst(skb);
+	struct ipv6hdr *top_iph;
+	int dsfield;
+
+	skb_reset_inner_network_header(skb);
+	skb_reset_inner_transport_header(skb);
+
+	skb_set_network_header(skb, -x->props.header_len + x->props.enc_hdr_len);
+	skb->mac_header = skb->network_header + offsetof(struct ipv6hdr, nexthdr);
+	skb->transport_header = skb->network_header + sizeof(*top_iph);
+
+	top_iph = ipv6_hdr(skb);
+	top_iph->version = 6;
+	top_iph->priority = 0;
+	memset(top_iph->flow_lbl, 0, sizeof(top_iph->flow_lbl));
+	top_iph->nexthdr = IPPROTO_AGGFRAG;
+
+	/* As we have 0, fractional, 1 or N inner packets there's no obviously
+	 * correct DSCP mapping to inherit. ECN should be cleared per RFC9347
+	 * 3.1.
+	 */
+	dsfield = 0;
+	ipv6_change_dsfield(top_iph, 0, dsfield);
+
+	top_iph->hop_limit = ip6_dst_hoplimit(xfrm_dst_child(dst));
+	top_iph->saddr = *(struct in6_addr *)&x->props.saddr;
+	top_iph->daddr = *(struct in6_addr *)&x->id.daddr;
+
+	return 0;
+}
+#endif
+
+/**
+ * iptfs_prepare_output() -  prepare the skb for output
+ * @x: xfrm state
+ * @skb: the packet
+ *
+ * Return: Error value, if 0 then skb values should be as follows:
+ *    - transport_header should point at ESP header
+ *    - network_header should point at Outer IP header
+ *    - mac_header should point at protocol/nexthdr of the outer IP
+ */
+static int iptfs_prepare_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+	if (x->outer_mode.family == AF_INET)
+		return iptfs_encap_add_ipv4(x, skb);
+	if (x->outer_mode.family == AF_INET6) {
+#if IS_ENABLED(CONFIG_IPV6)
+		return iptfs_encap_add_ipv6(x, skb);
+#else
+		return -EAFNOSUPPORT;
+#endif
+	}
+	return -EOPNOTSUPP;
+}
+
+/* ========================== */
+/* State Management Functions */
+/* ========================== */
+
+/**
+ * __iptfs_get_inner_mtu() - return inner MTU with no fragmentation.
+ * @x: xfrm state.
+ * @outer_mtu: the outer mtu
+ *
+ * Return: Correct MTU taking in to account the encap overhead.
+ */
+static u32 __iptfs_get_inner_mtu(struct xfrm_state *x, int outer_mtu)
+{
+	struct crypto_aead *aead;
+	u32 blksize;
+
+	aead = x->data;
+	blksize = ALIGN(crypto_aead_blocksize(aead), 4);
+	return ((outer_mtu - x->props.header_len - crypto_aead_authsize(aead)) &
+		~(blksize - 1)) - 2;
+}
+
+/**
+ * iptfs_get_inner_mtu() - return the inner MTU for an IPTFS xfrm.
+ * @x: xfrm state.
+ * @outer_mtu: Outer MTU for the encapsulated packet.
+ *
+ * Return: Correct MTU taking in to account the encap overhead.
+ */
+static u32 iptfs_get_inner_mtu(struct xfrm_state *x, int outer_mtu)
+{
+	struct xfrm_iptfs_data *xtfs = x->mode_data;
+
+	/* If not dont-frag we have no MTU */
+	if (!xtfs->cfg.dont_frag)
+		return x->outer_mode.family == AF_INET ? IP_MAX_MTU : IP6_MAX_MTU;
+	return __iptfs_get_inner_mtu(x, outer_mtu);
+}
+
+/**
+ * iptfs_user_init() - initialize the SA with IPTFS options from netlink.
+ * @net: the net data
+ * @x: xfrm state
+ * @attrs: netlink attributes
+ * @extack: extack return data
+ *
+ * Return: 0 on success or a negative error code on failure
+ */
+static int iptfs_user_init(struct net *net, struct xfrm_state *x,
+			   struct nlattr **attrs,
+			   struct netlink_ext_ack *extack)
+{
+	struct xfrm_iptfs_data *xtfs = x->mode_data;
+	struct xfrm_iptfs_config *xc;
+	u64 q;
+
+	xc = &xtfs->cfg;
+	xc->max_queue_size = IPTFS_DEFAULT_MAX_QUEUE_SIZE;
+	xc->reorder_win_size = IPTFS_DEFAULT_REORDER_WINDOW;
+	xtfs->drop_time_ns = IPTFS_DEFAULT_DROP_TIME_USECS * NSECS_IN_USEC;
+	xtfs->init_delay_ns = IPTFS_DEFAULT_INIT_DELAY_USECS * NSECS_IN_USEC;
+
+	if (attrs[XFRMA_IPTFS_DONT_FRAG])
+		xc->dont_frag = true;
+	if (attrs[XFRMA_IPTFS_REORDER_WINDOW])
+		xc->reorder_win_size =
+			nla_get_u16(attrs[XFRMA_IPTFS_REORDER_WINDOW]);
+	/* saved array is for saving 1..N seq nums from wantseq */
+	if (xc->reorder_win_size) {
+		xtfs->w_saved = kcalloc(xc->reorder_win_size,
+					sizeof(*xtfs->w_saved), GFP_KERNEL);
+		if (!xtfs->w_saved) {
+			NL_SET_ERR_MSG(extack, "Cannot alloc reorder window");
+			return -ENOMEM;
+		}
+	}
+	if (attrs[XFRMA_IPTFS_PKT_SIZE]) {
+		xc->pkt_size = nla_get_u32(attrs[XFRMA_IPTFS_PKT_SIZE]);
+		if (!xc->pkt_size) {
+			xtfs->payload_mtu = 0;
+		} else if (xc->pkt_size > x->props.header_len) {
+			xtfs->payload_mtu = xc->pkt_size - x->props.header_len;
+		} else {
+			NL_SET_ERR_MSG(extack,
+				       "Packet size must be 0 or greater than IPTFS/ESP header length");
+			return -EINVAL;
+		}
+	}
+	if (attrs[XFRMA_IPTFS_MAX_QSIZE])
+		xc->max_queue_size = nla_get_u32(attrs[XFRMA_IPTFS_MAX_QSIZE]);
+	if (attrs[XFRMA_IPTFS_DROP_TIME])
+		xtfs->drop_time_ns =
+			(u64)nla_get_u32(attrs[XFRMA_IPTFS_DROP_TIME]) *
+			NSECS_IN_USEC;
+	if (attrs[XFRMA_IPTFS_INIT_DELAY])
+		xtfs->init_delay_ns =
+			(u64)nla_get_u32(attrs[XFRMA_IPTFS_INIT_DELAY]) * NSECS_IN_USEC;
+
+	q = (u64)xc->max_queue_size * 95;
+	do_div(q, 100);
+	xtfs->ecn_queue_size = (u32)q;
+
+	return 0;
+}
+
+static unsigned int iptfs_sa_len(const struct xfrm_state *x)
+{
+	struct xfrm_iptfs_data *xtfs = x->mode_data;
+	struct xfrm_iptfs_config *xc = &xtfs->cfg;
+	unsigned int l = 0;
+
+	if (x->dir == XFRM_SA_DIR_IN) {
+		l += nla_total_size(sizeof(u32)); /* drop time usec */
+		l += nla_total_size(sizeof(xc->reorder_win_size));
+	} else {
+		if (xc->dont_frag)
+			l += nla_total_size(0);	  /* dont-frag flag */
+		l += nla_total_size(sizeof(u32)); /* init delay usec */
+		l += nla_total_size(sizeof(xc->max_queue_size));
+		l += nla_total_size(sizeof(xc->pkt_size));
+	}
+
+	return l;
+}
+
+static int iptfs_copy_to_user(struct xfrm_state *x, struct sk_buff *skb)
+{
+	struct xfrm_iptfs_data *xtfs = x->mode_data;
+	struct xfrm_iptfs_config *xc = &xtfs->cfg;
+	int ret = 0;
+	u64 q;
+
+	if (x->dir == XFRM_SA_DIR_IN) {
+		q = xtfs->drop_time_ns;
+		do_div(q, NSECS_IN_USEC);
+		ret = nla_put_u32(skb, XFRMA_IPTFS_DROP_TIME, q);
+		if (ret)
+			return ret;
+
+		ret = nla_put_u16(skb, XFRMA_IPTFS_REORDER_WINDOW,
+				  xc->reorder_win_size);
+	} else {
+		if (xc->dont_frag) {
+			ret = nla_put_flag(skb, XFRMA_IPTFS_DONT_FRAG);
+			if (ret)
+				return ret;
+		}
+
+		q = xtfs->init_delay_ns;
+		do_div(q, NSECS_IN_USEC);
+		ret = nla_put_u32(skb, XFRMA_IPTFS_INIT_DELAY, q);
+		if (ret)
+			return ret;
+
+		ret = nla_put_u32(skb, XFRMA_IPTFS_MAX_QSIZE, xc->max_queue_size);
+		if (ret)
+			return ret;
+
+		ret = nla_put_u32(skb, XFRMA_IPTFS_PKT_SIZE, xc->pkt_size);
+	}
+
+	return ret;
+}
+
+static void __iptfs_init_state(struct xfrm_state *x,
+			       struct xfrm_iptfs_data *xtfs)
+{
+	__skb_queue_head_init(&xtfs->queue);
+	hrtimer_init(&xtfs->iptfs_timer, CLOCK_MONOTONIC, IPTFS_HRTIMER_MODE);
+	xtfs->iptfs_timer.function = iptfs_delay_timer;
+
+	spin_lock_init(&xtfs->drop_lock);
+	hrtimer_init(&xtfs->drop_timer, CLOCK_MONOTONIC, IPTFS_HRTIMER_MODE);
+	xtfs->drop_timer.function = iptfs_drop_timer;
+
+	/* Modify type (esp) adjustment values */
+
+	if (x->props.family == AF_INET)
+		x->props.header_len += sizeof(struct iphdr) + sizeof(struct ip_iptfs_hdr);
+	else if (x->props.family == AF_INET6)
+		x->props.header_len += sizeof(struct ipv6hdr) + sizeof(struct ip_iptfs_hdr);
+	x->props.enc_hdr_len = sizeof(struct ip_iptfs_hdr);
+
+	/* Always keep a module reference when x->mode_data is set */
+	__module_get(x->mode_cbs->owner);
+
+	x->mode_data = xtfs;
+	xtfs->x = x;
+}
+
+static int iptfs_clone_state(struct xfrm_state *x, struct xfrm_state *orig)
+{
+	struct xfrm_iptfs_data *xtfs;
+
+	xtfs = kmemdup(orig->mode_data, sizeof(*xtfs), GFP_KERNEL);
+	if (!xtfs)
+		return -ENOMEM;
+
+	x->mode_data = xtfs;
+	xtfs->x = x;
+
+	xtfs->ra_newskb = NULL;
+	if (xtfs->cfg.reorder_win_size) {
+		xtfs->w_saved = kcalloc(xtfs->cfg.reorder_win_size,
+					sizeof(*xtfs->w_saved), GFP_KERNEL);
+		if (!xtfs->w_saved) {
+			kfree_sensitive(xtfs);
+			return -ENOMEM;
+		}
+	}
+
+	return 0;
+}
+
+static int iptfs_init_state(struct xfrm_state *x)
+{
+	struct xfrm_iptfs_data *xtfs;
+
+	if (x->mode_data) {
+		/* We have arrived here from xfrm_state_clone() */
+		xtfs = x->mode_data;
+	} else {
+		xtfs = kzalloc(sizeof(*xtfs), GFP_KERNEL);
+		if (!xtfs)
+			return -ENOMEM;
+	}
+
+	__iptfs_init_state(x, xtfs);
+
+	return 0;
+}
+
+static void iptfs_destroy_state(struct xfrm_state *x)
+{
+	struct xfrm_iptfs_data *xtfs = x->mode_data;
+	struct sk_buff_head list;
+	struct skb_wseq *s, *se;
+	struct sk_buff *skb;
+
+	if (!xtfs)
+		return;
+
+	spin_lock_bh(&xtfs->x->lock);
+	hrtimer_cancel(&xtfs->iptfs_timer);
+	__skb_queue_head_init(&list);
+	skb_queue_splice_init(&xtfs->queue, &list);
+	spin_unlock_bh(&xtfs->x->lock);
+
+	while ((skb = __skb_dequeue(&list)))
+		kfree_skb(skb);
+
+	spin_lock_bh(&xtfs->drop_lock);
+	hrtimer_cancel(&xtfs->drop_timer);
+	spin_unlock_bh(&xtfs->drop_lock);
+
+	if (xtfs->ra_newskb)
+		kfree_skb(xtfs->ra_newskb);
+
+	for (s = xtfs->w_saved, se = s + xtfs->w_savedlen; s < se; s++) {
+		if (s->skb)
+			kfree_skb(s->skb);
+	}
+
+	kfree_sensitive(xtfs->w_saved);
+	kfree_sensitive(xtfs);
+
+	module_put(x->mode_cbs->owner);
+}
+
+static const struct xfrm_mode_cbs iptfs_mode_cbs = {
+	.owner = THIS_MODULE,
+	.init_state = iptfs_init_state,
+	.clone_state = iptfs_clone_state,
+	.destroy_state = iptfs_destroy_state,
+	.user_init = iptfs_user_init,
+	.copy_to_user = iptfs_copy_to_user,
+	.sa_len = iptfs_sa_len,
+	.get_inner_mtu = iptfs_get_inner_mtu,
+	.input = iptfs_input,
+	.output = iptfs_output_collect,
+	.prepare_output = iptfs_prepare_output,
+};
+
+static int __init xfrm_iptfs_init(void)
+{
+	int err;
+
+	pr_info("xfrm_iptfs: IPsec IP-TFS tunnel mode module\n");
+
+	err = xfrm_register_mode_cbs(XFRM_MODE_IPTFS, &iptfs_mode_cbs);
+	if (err < 0)
+		pr_info("%s: can't register IP-TFS\n", __func__);
+
+	return err;
+}
+
+static void __exit xfrm_iptfs_fini(void)
+{
+	xfrm_unregister_mode_cbs(XFRM_MODE_IPTFS);
+}
+
+module_init(xfrm_iptfs_init);
+module_exit(xfrm_iptfs_fini);
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("IP-TFS support for xfrm ipsec tunnels");
diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c
index e5722c95b8bb..b5025cf6136e 100644
--- a/net/xfrm/xfrm_output.c
+++ b/net/xfrm/xfrm_output.c
@@ -472,6 +472,8 @@ static int xfrm_outer_mode_output(struct xfrm_state *x, struct sk_buff *skb)
 		WARN_ON_ONCE(1);
 		break;
 	default:
+		if (x->mode_cbs && x->mode_cbs->prepare_output)
+			return x->mode_cbs->prepare_output(x, skb);
 		WARN_ON_ONCE(1);
 		break;
 	}
@@ -675,6 +677,10 @@ static void xfrm_get_inner_ipproto(struct sk_buff *skb, struct xfrm_state *x)
 
 		return;
 	}
+	if (x->outer_mode.encap == XFRM_MODE_IPTFS) {
+		xo->inner_ipproto = IPPROTO_AGGFRAG;
+		return;
+	}
 
 	/* non-Tunnel Mode */
 	if (!skb->encapsulation)
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 4408c11c0835..9e510021ee91 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -2497,6 +2497,7 @@ xfrm_tmpl_resolve_one(struct xfrm_policy *policy, const struct flowi *fl,
 		struct xfrm_tmpl *tmpl = &policy->xfrm_vec[i];
 
 		if (tmpl->mode == XFRM_MODE_TUNNEL ||
+		    tmpl->mode == XFRM_MODE_IPTFS ||
 		    tmpl->mode == XFRM_MODE_BEET) {
 			remote = &tmpl->id.daddr;
 			local = &tmpl->saddr;
@@ -2748,13 +2749,17 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy,
 
 		dst1->input = dst_discard;
 
-		rcu_read_lock();
-		afinfo = xfrm_state_afinfo_get_rcu(inner_mode->family);
-		if (likely(afinfo))
-			dst1->output = afinfo->output;
-		else
-			dst1->output = dst_discard_out;
-		rcu_read_unlock();
+		if (xfrm[i]->mode_cbs && xfrm[i]->mode_cbs->output) {
+			dst1->output = xfrm[i]->mode_cbs->output;
+		} else {
+			rcu_read_lock();
+			afinfo = xfrm_state_afinfo_get_rcu(inner_mode->family);
+			if (likely(afinfo))
+				dst1->output = afinfo->output;
+			else
+				dst1->output = dst_discard_out;
+			rcu_read_unlock();
+		}
 
 		xdst_prev = xdst;
 
@@ -3290,7 +3295,8 @@ no_transform:
 ok:
 	xfrm_pols_put(pols, drop_pols);
 	if (dst && dst->xfrm &&
-	    dst->xfrm->props.mode == XFRM_MODE_TUNNEL)
+	    (dst->xfrm->props.mode == XFRM_MODE_TUNNEL ||
+	     dst->xfrm->props.mode == XFRM_MODE_IPTFS))
 		dst->flags |= DST_XFRM_TUNNEL;
 	return dst;
 
@@ -4519,6 +4525,7 @@ static int migrate_tmpl_match(const struct xfrm_migrate *m, const struct xfrm_tm
 		switch (t->mode) {
 		case XFRM_MODE_TUNNEL:
 		case XFRM_MODE_BEET:
+		case XFRM_MODE_IPTFS:
 			if (xfrm_addr_equal(&t->id.daddr, &m->old_daddr,
 					    m->old_family) &&
 			    xfrm_addr_equal(&t->saddr, &m->old_saddr,
@@ -4561,7 +4568,8 @@ static int xfrm_policy_migrate(struct xfrm_policy *pol,
 				continue;
 			n++;
 			if (pol->xfrm_vec[i].mode != XFRM_MODE_TUNNEL &&
-			    pol->xfrm_vec[i].mode != XFRM_MODE_BEET)
+			    pol->xfrm_vec[i].mode != XFRM_MODE_BEET &&
+			    pol->xfrm_vec[i].mode != XFRM_MODE_IPTFS)
 				continue;
 			/* update endpoints */
 			memcpy(&pol->xfrm_vec[i].id.daddr, &mp->new_daddr,
diff --git a/net/xfrm/xfrm_proc.c b/net/xfrm/xfrm_proc.c
index eeb984be03a7..8e07dd614b0b 100644
--- a/net/xfrm/xfrm_proc.c
+++ b/net/xfrm/xfrm_proc.c
@@ -43,6 +43,8 @@ static const struct snmp_mib xfrm_mib_list[] = {
 	SNMP_MIB_ITEM("XfrmAcquireError", LINUX_MIB_XFRMACQUIREERROR),
 	SNMP_MIB_ITEM("XfrmOutStateDirError", LINUX_MIB_XFRMOUTSTATEDIRERROR),
 	SNMP_MIB_ITEM("XfrmInStateDirError", LINUX_MIB_XFRMINSTATEDIRERROR),
+	SNMP_MIB_ITEM("XfrmInIptfsError", LINUX_MIB_XFRMINIPTFSERROR),
+	SNMP_MIB_ITEM("XfrmOutNoQueueSpace", LINUX_MIB_XFRMOUTNOQSPACE),
 	SNMP_MIB_SENTINEL
 };
 
diff --git a/net/xfrm/xfrm_replay.c b/net/xfrm/xfrm_replay.c
index bc56c6305725..e500aebbad22 100644
--- a/net/xfrm/xfrm_replay.c
+++ b/net/xfrm/xfrm_replay.c
@@ -729,6 +729,7 @@ static int xfrm_replay_overflow_offload_esn(struct xfrm_state *x, struct sk_buff
 		}
 
 		replay_esn->oseq = oseq;
+		xfrm_dev_state_advance_esn(x);
 
 		if (xfrm_aevent_is_on(net))
 			xfrm_replay_notify(x, XFRM_REPLAY_UPDATE);
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 67ca7ac955a3..34067cb8a479 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -467,6 +467,11 @@ static const struct xfrm_mode xfrm4_mode_map[XFRM_MODE_MAX] = {
 		.flags = XFRM_MODE_FLAG_TUNNEL,
 		.family = AF_INET,
 	},
+	[XFRM_MODE_IPTFS] = {
+		.encap = XFRM_MODE_IPTFS,
+		.flags = XFRM_MODE_FLAG_TUNNEL,
+		.family = AF_INET,
+	},
 };
 
 static const struct xfrm_mode xfrm6_mode_map[XFRM_MODE_MAX] = {
@@ -488,6 +493,11 @@ static const struct xfrm_mode xfrm6_mode_map[XFRM_MODE_MAX] = {
 		.flags = XFRM_MODE_FLAG_TUNNEL,
 		.family = AF_INET6,
 	},
+	[XFRM_MODE_IPTFS] = {
+		.encap = XFRM_MODE_IPTFS,
+		.flags = XFRM_MODE_FLAG_TUNNEL,
+		.family = AF_INET6,
+	},
 };
 
 static const struct xfrm_mode *xfrm_get_mode(unsigned int encap, int family)
@@ -515,6 +525,60 @@ static const struct xfrm_mode *xfrm_get_mode(unsigned int encap, int family)
 	return NULL;
 }
 
+static const struct xfrm_mode_cbs  __rcu *xfrm_mode_cbs_map[XFRM_MODE_MAX];
+static DEFINE_SPINLOCK(xfrm_mode_cbs_map_lock);
+
+int xfrm_register_mode_cbs(u8 mode, const struct xfrm_mode_cbs *mode_cbs)
+{
+	if (mode >= XFRM_MODE_MAX)
+		return -EINVAL;
+
+	spin_lock_bh(&xfrm_mode_cbs_map_lock);
+	rcu_assign_pointer(xfrm_mode_cbs_map[mode], mode_cbs);
+	spin_unlock_bh(&xfrm_mode_cbs_map_lock);
+
+	return 0;
+}
+EXPORT_SYMBOL(xfrm_register_mode_cbs);
+
+void xfrm_unregister_mode_cbs(u8 mode)
+{
+	if (mode >= XFRM_MODE_MAX)
+		return;
+
+	spin_lock_bh(&xfrm_mode_cbs_map_lock);
+	RCU_INIT_POINTER(xfrm_mode_cbs_map[mode], NULL);
+	spin_unlock_bh(&xfrm_mode_cbs_map_lock);
+	synchronize_rcu();
+}
+EXPORT_SYMBOL(xfrm_unregister_mode_cbs);
+
+static const struct xfrm_mode_cbs *xfrm_get_mode_cbs(u8 mode)
+{
+	const struct xfrm_mode_cbs *cbs;
+	bool try_load = true;
+
+	if (mode >= XFRM_MODE_MAX)
+		return NULL;
+
+retry:
+	rcu_read_lock();
+
+	cbs = rcu_dereference(xfrm_mode_cbs_map[mode]);
+	if (cbs && !try_module_get(cbs->owner))
+		cbs = NULL;
+
+	rcu_read_unlock();
+
+	if (mode == XFRM_MODE_IPTFS && !cbs && try_load) {
+		request_module("xfrm-iptfs");
+		try_load = false;
+		goto retry;
+	}
+
+	return cbs;
+}
+
 void xfrm_state_free(struct xfrm_state *x)
 {
 	kmem_cache_free(xfrm_state_cache, x);
@@ -523,6 +587,8 @@ EXPORT_SYMBOL(xfrm_state_free);
 
 static void ___xfrm_state_destroy(struct xfrm_state *x)
 {
+	if (x->mode_cbs && x->mode_cbs->destroy_state)
+		x->mode_cbs->destroy_state(x);
 	hrtimer_cancel(&x->mtimer);
 	del_timer_sync(&x->rtimer);
 	kfree(x->aead);
@@ -682,6 +748,7 @@ struct xfrm_state *xfrm_state_alloc(struct net *net)
 		x->replay_maxdiff = 0;
 		x->pcpu_num = UINT_MAX;
 		spin_lock_init(&x->lock);
+		x->mode_data = NULL;
 	}
 	return x;
 }
@@ -1945,6 +2012,12 @@ static struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig,
 	x->new_mapping_sport = 0;
 	x->dir = orig->dir;
 
+	x->mode_cbs = orig->mode_cbs;
+	if (x->mode_cbs && x->mode_cbs->clone_state) {
+		if (x->mode_cbs->clone_state(x, orig))
+			goto error;
+	}
+
 	return x;
 
  error:
@@ -2271,6 +2344,7 @@ static int __xfrm6_state_sort_cmp(const void *p)
 #endif
 	case XFRM_MODE_TUNNEL:
 	case XFRM_MODE_BEET:
+	case XFRM_MODE_IPTFS:
 		return 4;
 	}
 	return 5;
@@ -2297,6 +2371,7 @@ static int __xfrm6_tmpl_sort_cmp(const void *p)
 #endif
 	case XFRM_MODE_TUNNEL:
 	case XFRM_MODE_BEET:
+	case XFRM_MODE_IPTFS:
 		return 3;
 	}
 	return 4;
@@ -2986,6 +3061,9 @@ u32 xfrm_state_mtu(struct xfrm_state *x, int mtu)
 	case XFRM_MODE_TUNNEL:
 		break;
 	default:
+		if (x->mode_cbs && x->mode_cbs->get_inner_mtu)
+			return x->mode_cbs->get_inner_mtu(x, mtu);
+
 		WARN_ON_ONCE(1);
 		break;
 	}
@@ -3086,6 +3164,12 @@ int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload,
 		}
 	}
 
+	x->mode_cbs = xfrm_get_mode_cbs(x->props.mode);
+	if (x->mode_cbs) {
+		if (x->mode_cbs->init_state)
+			err = x->mode_cbs->init_state(x);
+		module_put(x->mode_cbs->owner);
+	}
 error:
 	return err;
 }
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index b2876e09328b..08c6d6f0179f 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -301,6 +301,16 @@ static int verify_newsa_info(struct xfrm_usersa_info *p,
 			NL_SET_ERR_MSG(extack, "TFC padding can only be used in tunnel mode");
 			goto out;
 		}
+		if ((attrs[XFRMA_IPTFS_DROP_TIME] ||
+		     attrs[XFRMA_IPTFS_REORDER_WINDOW] ||
+		     attrs[XFRMA_IPTFS_DONT_FRAG] ||
+		     attrs[XFRMA_IPTFS_INIT_DELAY] ||
+		     attrs[XFRMA_IPTFS_MAX_QSIZE] ||
+		     attrs[XFRMA_IPTFS_PKT_SIZE]) &&
+		    p->mode != XFRM_MODE_IPTFS) {
+			NL_SET_ERR_MSG(extack, "IP-TFS options can only be used in IP-TFS mode");
+			goto out;
+		}
 		break;
 
 	case IPPROTO_COMP:
@@ -373,6 +383,16 @@ static int verify_newsa_info(struct xfrm_usersa_info *p,
 	case XFRM_MODE_ROUTEOPTIMIZATION:
 	case XFRM_MODE_BEET:
 		break;
+	case XFRM_MODE_IPTFS:
+		if (p->id.proto != IPPROTO_ESP) {
+			NL_SET_ERR_MSG(extack, "IP-TFS mode only supported with ESP");
+			goto out;
+		}
+		if (sa_dir == 0) {
+			NL_SET_ERR_MSG(extack, "IP-TFS mode requires in or out direction attribute");
+			goto out;
+		}
+		break;
 
 	default:
 		NL_SET_ERR_MSG(extack, "Unsupported mode");
@@ -421,6 +441,18 @@ static int verify_newsa_info(struct xfrm_usersa_info *p,
 			goto out;
 		}
 
+		if (attrs[XFRMA_IPTFS_DROP_TIME]) {
+			NL_SET_ERR_MSG(extack, "IP-TFS drop time should not be set for output SA");
+			err = -EINVAL;
+			goto out;
+		}
+
+		if (attrs[XFRMA_IPTFS_REORDER_WINDOW]) {
+			NL_SET_ERR_MSG(extack, "IP-TFS reorder window should not be set for output SA");
+			err = -EINVAL;
+			goto out;
+		}
+
 		if (attrs[XFRMA_REPLAY_VAL]) {
 			struct xfrm_replay_state *replay;
 
@@ -458,6 +490,30 @@ static int verify_newsa_info(struct xfrm_usersa_info *p,
 			}
 
 		}
+
+		if (attrs[XFRMA_IPTFS_DONT_FRAG]) {
+			NL_SET_ERR_MSG(extack, "IP-TFS don't fragment should not be set for input SA");
+			err = -EINVAL;
+			goto out;
+		}
+
+		if (attrs[XFRMA_IPTFS_INIT_DELAY]) {
+			NL_SET_ERR_MSG(extack, "IP-TFS initial delay should not be set for input SA");
+			err = -EINVAL;
+			goto out;
+		}
+
+		if (attrs[XFRMA_IPTFS_MAX_QSIZE]) {
+			NL_SET_ERR_MSG(extack, "IP-TFS max queue size should not be set for input SA");
+			err = -EINVAL;
+			goto out;
+		}
+
+		if (attrs[XFRMA_IPTFS_PKT_SIZE]) {
+			NL_SET_ERR_MSG(extack, "IP-TFS packet size should not be set for input SA");
+			err = -EINVAL;
+			goto out;
+		}
 	}
 
 	if (!sa_dir && attrs[XFRMA_SA_PCPU]) {
@@ -886,6 +942,12 @@ static struct xfrm_state *xfrm_state_construct(struct net *net,
 			goto error;
 	}
 
+	if (x->mode_cbs && x->mode_cbs->user_init) {
+		err = x->mode_cbs->user_init(net, x, attrs, extack);
+		if (err)
+			goto error;
+	}
+
 	return x;
 
 error:
@@ -1301,6 +1363,10 @@ static int copy_to_user_state_extra(struct xfrm_state *x,
 		if (ret)
 			goto out;
 	}
+	if (x->mode_cbs && x->mode_cbs->copy_to_user)
+		ret = x->mode_cbs->copy_to_user(x, skb);
+	if (ret)
+		goto out;
 	if (x->mapping_maxage) {
 		ret = nla_put_u32(skb, XFRMA_MTIMER_THRESH, x->mapping_maxage);
 		if (ret)
@@ -1958,6 +2024,8 @@ static int validate_tmpl(int nr, struct xfrm_user_tmpl *ut, u16 family,
 				return -EINVAL;
 			}
 			break;
+		case XFRM_MODE_IPTFS:
+			break;
 		default:
 			if (ut[i].family != prev_family) {
 				NL_SET_ERR_MSG(extack, "Mode in template doesn't support a family change");
@@ -3220,6 +3288,12 @@ const struct nla_policy xfrma_policy[XFRMA_MAX+1] = {
 	[XFRMA_SA_DIR]          = NLA_POLICY_RANGE(NLA_U8, XFRM_SA_DIR_IN, XFRM_SA_DIR_OUT),
 	[XFRMA_NAT_KEEPALIVE_INTERVAL] = { .type = NLA_U32 },
 	[XFRMA_SA_PCPU]		= { .type = NLA_U32 },
+	[XFRMA_IPTFS_DROP_TIME]		= { .type = NLA_U32 },
+	[XFRMA_IPTFS_REORDER_WINDOW]	= { .type = NLA_U16 },
+	[XFRMA_IPTFS_DONT_FRAG]		= { .type = NLA_FLAG },
+	[XFRMA_IPTFS_INIT_DELAY]	= { .type = NLA_U32 },
+	[XFRMA_IPTFS_MAX_QSIZE]		= { .type = NLA_U32 },
+	[XFRMA_IPTFS_PKT_SIZE]	= { .type = NLA_U32 },
 };
 EXPORT_SYMBOL_GPL(xfrma_policy);
 
@@ -3554,6 +3628,9 @@ static inline unsigned int xfrm_sa_len(struct xfrm_state *x)
 	if (x->nat_keepalive_interval)
 		l += nla_total_size(sizeof(x->nat_keepalive_interval));
 
+	if (x->mode_cbs && x->mode_cbs->sa_len)
+		l += x->mode_cbs->sa_len(x);
+
 	return l;
 }
 
diff --git a/rust/kernel/workqueue.rs b/rust/kernel/workqueue.rs
index 4d1d2062f6eb..fd3e97192ed8 100644
--- a/rust/kernel/workqueue.rs
+++ b/rust/kernel/workqueue.rs
@@ -519,7 +519,15 @@ impl_has_work! {
     impl{T} HasWork<Self> for ClosureWork<T> { self.work }
 }
 
-// SAFETY: TODO.
+// SAFETY: The `__enqueue` implementation in RawWorkItem uses a `work_struct` initialized with the
+// `run` method of this trait as the function pointer because:
+//   - `__enqueue` gets the `work_struct` from the `Work` field, using `T::raw_get_work`.
+//   - The only safe way to create a `Work` object is through `Work::new`.
+//   - `Work::new` makes sure that `T::Pointer::run` is passed to `init_work_with_key`.
+//   - Finally `Work` and `RawWorkItem` guarantee that the correct `Work` field
+//     will be used because of the ID const generic bound. This makes sure that `T::raw_get_work`
+//     uses the correct offset for the `Work` field, and `Work::new` picks the correct
+//     implementation of `WorkItemPointer` for `Arc<T>`.
 unsafe impl<T, const ID: u64> WorkItemPointer<ID> for Arc<T>
 where
     T: WorkItem<ID, Pointer = Self>,
@@ -537,7 +545,13 @@ where
     }
 }
 
-// SAFETY: TODO.
+// SAFETY: The `work_struct` raw pointer is guaranteed to be valid for the duration of the call to
+// the closure because we get it from an `Arc`, which means that the ref count will be at least 1,
+// and we don't drop the `Arc` ourselves. If `queue_work_on` returns true, it is further guaranteed
+// to be valid until a call to the function pointer in `work_struct` because we leak the memory it
+// points to, and only reclaim it if the closure returns false, or in `WorkItemPointer::run`, which
+// is what the function pointer in the `work_struct` must be pointing to, according to the safety
+// requirements of `WorkItemPointer`.
 unsafe impl<T, const ID: u64> RawWorkItem<ID> for Arc<T>
 where
     T: WorkItem<ID, Pointer = Self>,
diff --git a/scripts/mksysmap b/scripts/mksysmap
index c12723a04655..3accbdb269ac 100755
--- a/scripts/mksysmap
+++ b/scripts/mksysmap
@@ -26,7 +26,7 @@
 #  (do not forget a space before each pattern)
 
 # local symbols for ARM, MIPS, etc.
-/ \\$/d
+/ \$/d
 
 # local labels, .LBB, .Ltmpxxx, .L__unnamed_xx, .LASANPC, etc.
 / \.L/d
@@ -39,7 +39,7 @@
 / __pi_\.L/d
 
 # arm64 local symbols in non-VHE KVM namespace
-/ __kvm_nvhe_\\$/d
+/ __kvm_nvhe_\$/d
 / __kvm_nvhe_\.L/d
 
 # lld arm/aarch64/mips thunks
diff --git a/scripts/mod/file2alias.c b/scripts/mod/file2alias.c
index 5b5745f00eb3..19ec72a69e90 100644
--- a/scripts/mod/file2alias.c
+++ b/scripts/mod/file2alias.c
@@ -132,7 +132,8 @@ struct devtable {
  * based at address m.
  */
 #define DEF_FIELD(m, devid, f) \
-	typeof(((struct devid *)0)->f) f = TO_NATIVE(*(typeof(f) *)((m) + OFF_##devid##_##f))
+	typeof(((struct devid *)0)->f) f = \
+		get_unaligned_native((typeof(f) *)((m) + OFF_##devid##_##f))
 
 /* Define a variable f that holds the address of field f of struct devid
  * based at address m.  Due to the way typeof works, for a field of type
@@ -600,7 +601,7 @@ static void do_pnp_card_entry(struct module *mod, void *symval)
 static void do_pcmcia_entry(struct module *mod, void *symval)
 {
 	char alias[256] = {};
-	unsigned int i;
+
 	DEF_FIELD(symval, pcmcia_device_id, match_flags);
 	DEF_FIELD(symval, pcmcia_device_id, manf_id);
 	DEF_FIELD(symval, pcmcia_device_id, card_id);
@@ -609,10 +610,6 @@ static void do_pcmcia_entry(struct module *mod, void *symval)
 	DEF_FIELD(symval, pcmcia_device_id, device_no);
 	DEF_FIELD_ADDR(symval, pcmcia_device_id, prod_id_hash);
 
-	for (i=0; i<4; i++) {
-		(*prod_id_hash)[i] = TO_NATIVE((*prod_id_hash)[i]);
-	}
-
 	ADD(alias, "m", match_flags & PCMCIA_DEV_ID_MATCH_MANF_ID,
 	    manf_id);
 	ADD(alias, "c", match_flags & PCMCIA_DEV_ID_MATCH_CARD_ID,
@@ -623,10 +620,14 @@ static void do_pcmcia_entry(struct module *mod, void *symval)
 	    function);
 	ADD(alias, "pfn", match_flags & PCMCIA_DEV_ID_MATCH_DEVICE_NO,
 	    device_no);
-	ADD(alias, "pa", match_flags & PCMCIA_DEV_ID_MATCH_PROD_ID1, (*prod_id_hash)[0]);
-	ADD(alias, "pb", match_flags & PCMCIA_DEV_ID_MATCH_PROD_ID2, (*prod_id_hash)[1]);
-	ADD(alias, "pc", match_flags & PCMCIA_DEV_ID_MATCH_PROD_ID3, (*prod_id_hash)[2]);
-	ADD(alias, "pd", match_flags & PCMCIA_DEV_ID_MATCH_PROD_ID4, (*prod_id_hash)[3]);
+	ADD(alias, "pa", match_flags & PCMCIA_DEV_ID_MATCH_PROD_ID1,
+	    get_unaligned_native(*prod_id_hash + 0));
+	ADD(alias, "pb", match_flags & PCMCIA_DEV_ID_MATCH_PROD_ID2,
+	    get_unaligned_native(*prod_id_hash + 1));
+	ADD(alias, "pc", match_flags & PCMCIA_DEV_ID_MATCH_PROD_ID3,
+	    get_unaligned_native(*prod_id_hash + 2));
+	ADD(alias, "pd", match_flags & PCMCIA_DEV_ID_MATCH_PROD_ID4,
+	    get_unaligned_native(*prod_id_hash + 3));
 
 	module_alias_printf(mod, true, "pcmcia:%s", alias);
 }
@@ -654,10 +655,9 @@ static void do_input(char *alias,
 {
 	unsigned int i;
 
-	for (i = min / BITS_PER_LONG; i < max / BITS_PER_LONG + 1; i++)
-		arr[i] = TO_NATIVE(arr[i]);
-	for (i = min; i < max; i++)
-		if (arr[i / BITS_PER_LONG] & (1ULL << (i%BITS_PER_LONG)))
+	for (i = min; i <= max; i++)
+		if (get_unaligned_native(arr + i / BITS_PER_LONG) &
+		    (1ULL << (i % BITS_PER_LONG)))
 			sprintf(alias + strlen(alias), "%X,*", i);
 }
 
@@ -812,15 +812,13 @@ static void do_virtio_entry(struct module *mod, void *symval)
  * Each byte of the guid will be represented by two hex characters
  * in the name.
  */
-
 static void do_vmbus_entry(struct module *mod, void *symval)
 {
-	int i;
 	DEF_FIELD_ADDR(symval, hv_vmbus_device_id, guid);
-	char guid_name[(sizeof(*guid) + 1) * 2];
+	char guid_name[sizeof(*guid) * 2 + 1];
 
-	for (i = 0; i < (sizeof(*guid) * 2); i += 2)
-		sprintf(&guid_name[i], "%02x", TO_NATIVE((guid->b)[i/2]));
+	for (int i = 0; i < sizeof(*guid); i++)
+		sprintf(&guid_name[i * 2], "%02x", guid->b[i]);
 
 	module_alias_printf(mod, false, "vmbus:%s", guid_name);
 }
diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c
index 94ee49207a45..7ea59dc4926b 100644
--- a/scripts/mod/modpost.c
+++ b/scripts/mod/modpost.c
@@ -1138,9 +1138,9 @@ static Elf_Addr addend_386_rel(uint32_t *location, unsigned int r_type)
 {
 	switch (r_type) {
 	case R_386_32:
-		return TO_NATIVE(*location);
+		return get_unaligned_native(location);
 	case R_386_PC32:
-		return TO_NATIVE(*location) + 4;
+		return get_unaligned_native(location) + 4;
 	}
 
 	return (Elf_Addr)(-1);
@@ -1161,24 +1161,24 @@ static Elf_Addr addend_arm_rel(void *loc, Elf_Sym *sym, unsigned int r_type)
 	switch (r_type) {
 	case R_ARM_ABS32:
 	case R_ARM_REL32:
-		inst = TO_NATIVE(*(uint32_t *)loc);
+		inst = get_unaligned_native((uint32_t *)loc);
 		return inst + sym->st_value;
 	case R_ARM_MOVW_ABS_NC:
 	case R_ARM_MOVT_ABS:
-		inst = TO_NATIVE(*(uint32_t *)loc);
+		inst = get_unaligned_native((uint32_t *)loc);
 		offset = sign_extend32(((inst & 0xf0000) >> 4) | (inst & 0xfff),
 				       15);
 		return offset + sym->st_value;
 	case R_ARM_PC24:
 	case R_ARM_CALL:
 	case R_ARM_JUMP24:
-		inst = TO_NATIVE(*(uint32_t *)loc);
+		inst = get_unaligned_native((uint32_t *)loc);
 		offset = sign_extend32((inst & 0x00ffffff) << 2, 25);
 		return offset + sym->st_value + 8;
 	case R_ARM_THM_MOVW_ABS_NC:
 	case R_ARM_THM_MOVT_ABS:
-		upper = TO_NATIVE(*(uint16_t *)loc);
-		lower = TO_NATIVE(*((uint16_t *)loc + 1));
+		upper = get_unaligned_native((uint16_t *)loc);
+		lower = get_unaligned_native((uint16_t *)loc + 1);
 		offset = sign_extend32(((upper & 0x000f) << 12) |
 				       ((upper & 0x0400) << 1) |
 				       ((lower & 0x7000) >> 4) |
@@ -1195,8 +1195,8 @@ static Elf_Addr addend_arm_rel(void *loc, Elf_Sym *sym, unsigned int r_type)
 		 * imm11 = lower[10:0]
 		 * imm32 = SignExtend(S:J2:J1:imm6:imm11:'0')
 		 */
-		upper = TO_NATIVE(*(uint16_t *)loc);
-		lower = TO_NATIVE(*((uint16_t *)loc + 1));
+		upper = get_unaligned_native((uint16_t *)loc);
+		lower = get_unaligned_native((uint16_t *)loc + 1);
 
 		sign = (upper >> 10) & 1;
 		j1 = (lower >> 13) & 1;
@@ -1219,8 +1219,8 @@ static Elf_Addr addend_arm_rel(void *loc, Elf_Sym *sym, unsigned int r_type)
 		 * I2    = NOT(J2 XOR S)
 		 * imm32 = SignExtend(S:I1:I2:imm10:imm11:'0')
 		 */
-		upper = TO_NATIVE(*(uint16_t *)loc);
-		lower = TO_NATIVE(*((uint16_t *)loc + 1));
+		upper = get_unaligned_native((uint16_t *)loc);
+		lower = get_unaligned_native((uint16_t *)loc + 1);
 
 		sign = (upper >> 10) & 1;
 		j1 = (lower >> 13) & 1;
@@ -1241,7 +1241,7 @@ static Elf_Addr addend_mips_rel(uint32_t *location, unsigned int r_type)
 {
 	uint32_t inst;
 
-	inst = TO_NATIVE(*location);
+	inst = get_unaligned_native(location);
 	switch (r_type) {
 	case R_MIPS_LO16:
 		return inst & 0xffff;
diff --git a/scripts/mod/modpost.h b/scripts/mod/modpost.h
index 8b72c227ebf4..ffd0a52a606e 100644
--- a/scripts/mod/modpost.h
+++ b/scripts/mod/modpost.h
@@ -65,6 +65,20 @@
 #define TO_NATIVE(x)	\
 	(target_is_big_endian == host_is_big_endian ? x : bswap(x))
 
+#define __get_unaligned_t(type, ptr) ({					\
+	const struct { type x; } __attribute__((__packed__)) *__pptr =	\
+						(typeof(__pptr))(ptr);	\
+	__pptr->x;							\
+})
+
+#define get_unaligned(ptr)	__get_unaligned_t(typeof(*(ptr)), (ptr))
+
+#define get_unaligned_native(ptr) \
+({ \
+	typeof(*(ptr)) _val = get_unaligned(ptr); \
+	TO_NATIVE(_val); \
+})
+
 #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
 
 #define strstarts(str, prefix) (strncmp(str, prefix, strlen(prefix)) == 0)
diff --git a/scripts/package/PKGBUILD b/scripts/package/PKGBUILD
index f83493838cf9..dca706617adc 100644
--- a/scripts/package/PKGBUILD
+++ b/scripts/package/PKGBUILD
@@ -103,7 +103,7 @@ _package-headers() {
 
 _package-api-headers() {
 	pkgdesc="Kernel headers sanitized for use in userspace"
-	provides=(linux-api-headers)
+	provides=(linux-api-headers="${pkgver}")
 	conflicts=(linux-api-headers)
 
 	_prologue
diff --git a/scripts/sorttable.h b/scripts/sorttable.h
index 7bd0184380d3..a7c5445baf00 100644
--- a/scripts/sorttable.h
+++ b/scripts/sorttable.h
@@ -110,7 +110,7 @@ static inline unsigned long orc_ip(const int *ip)
 
 static int orc_sort_cmp(const void *_a, const void *_b)
 {
-	struct orc_entry *orc_a;
+	struct orc_entry *orc_a, *orc_b;
 	const int *a = g_orc_ip_table + *(int *)_a;
 	const int *b = g_orc_ip_table + *(int *)_b;
 	unsigned long a_val = orc_ip(a);
@@ -128,6 +128,9 @@ static int orc_sort_cmp(const void *_a, const void *_b)
 	 * whitelisted .o files which didn't get objtool generation.
 	 */
 	orc_a = g_orc_table + (a - g_orc_ip_table);
+	orc_b = g_orc_table + (b - g_orc_ip_table);
+	if (orc_a->type == ORC_TYPE_UNDEFINED && orc_b->type == ORC_TYPE_UNDEFINED)
+		return 0;
 	return orc_a->type == ORC_TYPE_UNDEFINED ? -1 : 1;
 }
 
diff --git a/security/selinux/avc.c b/security/selinux/avc.c
index cc0b0af20296..1f2680bcc43a 100644
--- a/security/selinux/avc.c
+++ b/security/selinux/avc.c
@@ -174,13 +174,15 @@ int avc_get_hash_stats(char *page)
  * using a linked list for extended_perms_decision lookup because the list is
  * always small. i.e. less than 5, typically 1
  */
-static struct extended_perms_decision *avc_xperms_decision_lookup(u8 driver,
-					struct avc_xperms_node *xp_node)
+static struct extended_perms_decision *
+avc_xperms_decision_lookup(u8 driver, u8 base_perm,
+			   struct avc_xperms_node *xp_node)
 {
 	struct avc_xperms_decision_node *xpd_node;
 
 	list_for_each_entry(xpd_node, &xp_node->xpd_head, xpd_list) {
-		if (xpd_node->xpd.driver == driver)
+		if (xpd_node->xpd.driver == driver &&
+		    xpd_node->xpd.base_perm == base_perm)
 			return &xpd_node->xpd;
 	}
 	return NULL;
@@ -205,11 +207,12 @@ avc_xperms_has_perm(struct extended_perms_decision *xpd,
 }
 
 static void avc_xperms_allow_perm(struct avc_xperms_node *xp_node,
-				u8 driver, u8 perm)
+				  u8 driver, u8 base_perm, u8 perm)
 {
 	struct extended_perms_decision *xpd;
 	security_xperm_set(xp_node->xp.drivers.p, driver);
-	xpd = avc_xperms_decision_lookup(driver, xp_node);
+	xp_node->xp.base_perms |= base_perm;
+	xpd = avc_xperms_decision_lookup(driver, base_perm, xp_node);
 	if (xpd && xpd->allowed)
 		security_xperm_set(xpd->allowed->p, perm);
 }
@@ -245,6 +248,7 @@ static void avc_xperms_free(struct avc_xperms_node *xp_node)
 static void avc_copy_xperms_decision(struct extended_perms_decision *dest,
 					struct extended_perms_decision *src)
 {
+	dest->base_perm = src->base_perm;
 	dest->driver = src->driver;
 	dest->used = src->used;
 	if (dest->used & XPERMS_ALLOWED)
@@ -272,6 +276,7 @@ static inline void avc_quick_copy_xperms_decision(u8 perm,
 	 */
 	u8 i = perm >> 5;
 
+	dest->base_perm = src->base_perm;
 	dest->used = src->used;
 	if (dest->used & XPERMS_ALLOWED)
 		dest->allowed->p[i] = src->allowed->p[i];
@@ -357,6 +362,7 @@ static int avc_xperms_populate(struct avc_node *node,
 
 	memcpy(dest->xp.drivers.p, src->xp.drivers.p, sizeof(dest->xp.drivers.p));
 	dest->xp.len = src->xp.len;
+	dest->xp.base_perms = src->xp.base_perms;
 
 	/* for each source xpd allocate a destination xpd and copy */
 	list_for_each_entry(src_xpd, &src->xpd_head, xpd_list) {
@@ -807,6 +813,7 @@ out:
  * @event : Updating event
  * @perms : Permission mask bits
  * @driver: xperm driver information
+ * @base_perm: the base permission associated with the extended permission
  * @xperm: xperm permissions
  * @ssid: AVC entry source sid
  * @tsid: AVC entry target sid
@@ -820,10 +827,9 @@ out:
  * otherwise, this function updates the AVC entry. The original AVC-entry object
  * will release later by RCU.
  */
-static int avc_update_node(u32 event, u32 perms, u8 driver, u8 xperm, u32 ssid,
-			   u32 tsid, u16 tclass, u32 seqno,
-			   struct extended_perms_decision *xpd,
-			   u32 flags)
+static int avc_update_node(u32 event, u32 perms, u8 driver, u8 base_perm,
+			   u8 xperm, u32 ssid, u32 tsid, u16 tclass, u32 seqno,
+			   struct extended_perms_decision *xpd, u32 flags)
 {
 	u32 hvalue;
 	int rc = 0;
@@ -880,7 +886,7 @@ static int avc_update_node(u32 event, u32 perms, u8 driver, u8 xperm, u32 ssid,
 	case AVC_CALLBACK_GRANT:
 		node->ae.avd.allowed |= perms;
 		if (node->ae.xp_node && (flags & AVC_EXTENDED_PERMS))
-			avc_xperms_allow_perm(node->ae.xp_node, driver, xperm);
+			avc_xperms_allow_perm(node->ae.xp_node, driver, base_perm, xperm);
 		break;
 	case AVC_CALLBACK_TRY_REVOKE:
 	case AVC_CALLBACK_REVOKE:
@@ -987,10 +993,9 @@ static noinline void avc_compute_av(u32 ssid, u32 tsid, u16 tclass,
 	avc_insert(ssid, tsid, tclass, avd, xp_node);
 }
 
-static noinline int avc_denied(u32 ssid, u32 tsid,
-			       u16 tclass, u32 requested,
-			       u8 driver, u8 xperm, unsigned int flags,
-			       struct av_decision *avd)
+static noinline int avc_denied(u32 ssid, u32 tsid, u16 tclass, u32 requested,
+			       u8 driver, u8 base_perm, u8 xperm,
+			       unsigned int flags, struct av_decision *avd)
 {
 	if (flags & AVC_STRICT)
 		return -EACCES;
@@ -999,7 +1004,7 @@ static noinline int avc_denied(u32 ssid, u32 tsid,
 	    !(avd->flags & AVD_FLAGS_PERMISSIVE))
 		return -EACCES;
 
-	avc_update_node(AVC_CALLBACK_GRANT, requested, driver,
+	avc_update_node(AVC_CALLBACK_GRANT, requested, driver, base_perm,
 			xperm, ssid, tsid, tclass, avd->seqno, NULL, flags);
 	return 0;
 }
@@ -1012,7 +1017,8 @@ static noinline int avc_denied(u32 ssid, u32 tsid,
  * driver field is used to specify which set contains the permission.
  */
 int avc_has_extended_perms(u32 ssid, u32 tsid, u16 tclass, u32 requested,
-			   u8 driver, u8 xperm, struct common_audit_data *ad)
+			   u8 driver, u8 base_perm, u8 xperm,
+			   struct common_audit_data *ad)
 {
 	struct avc_node *node;
 	struct av_decision avd;
@@ -1047,22 +1053,23 @@ int avc_has_extended_perms(u32 ssid, u32 tsid, u16 tclass, u32 requested,
 	local_xpd.auditallow = &auditallow;
 	local_xpd.dontaudit = &dontaudit;
 
-	xpd = avc_xperms_decision_lookup(driver, xp_node);
+	xpd = avc_xperms_decision_lookup(driver, base_perm, xp_node);
 	if (unlikely(!xpd)) {
 		/*
 		 * Compute the extended_perms_decision only if the driver
-		 * is flagged
+		 * is flagged and the base permission is known.
 		 */
-		if (!security_xperm_test(xp_node->xp.drivers.p, driver)) {
+		if (!security_xperm_test(xp_node->xp.drivers.p, driver) ||
+		    !(xp_node->xp.base_perms & base_perm)) {
 			avd.allowed &= ~requested;
 			goto decision;
 		}
 		rcu_read_unlock();
-		security_compute_xperms_decision(ssid, tsid, tclass,
-						 driver, &local_xpd);
+		security_compute_xperms_decision(ssid, tsid, tclass, driver,
+						 base_perm, &local_xpd);
 		rcu_read_lock();
-		avc_update_node(AVC_CALLBACK_ADD_XPERMS, requested,
-				driver, xperm, ssid, tsid, tclass, avd.seqno,
+		avc_update_node(AVC_CALLBACK_ADD_XPERMS, requested, driver,
+				base_perm, xperm, ssid, tsid, tclass, avd.seqno,
 				&local_xpd, 0);
 	} else {
 		avc_quick_copy_xperms_decision(xperm, &local_xpd, xpd);
@@ -1075,8 +1082,8 @@ int avc_has_extended_perms(u32 ssid, u32 tsid, u16 tclass, u32 requested,
 decision:
 	denied = requested & ~(avd.allowed);
 	if (unlikely(denied))
-		rc = avc_denied(ssid, tsid, tclass, requested,
-				driver, xperm, AVC_EXTENDED_PERMS, &avd);
+		rc = avc_denied(ssid, tsid, tclass, requested, driver,
+				base_perm, xperm, AVC_EXTENDED_PERMS, &avd);
 
 	rcu_read_unlock();
 
@@ -1110,7 +1117,7 @@ static noinline int avc_perm_nonode(u32 ssid, u32 tsid, u16 tclass,
 	avc_compute_av(ssid, tsid, tclass, avd, &xp_node);
 	denied = requested & ~(avd->allowed);
 	if (unlikely(denied))
-		return avc_denied(ssid, tsid, tclass, requested, 0, 0,
+		return avc_denied(ssid, tsid, tclass, requested, 0, 0, 0,
 				  flags, avd);
 	return 0;
 }
@@ -1158,7 +1165,7 @@ inline int avc_has_perm_noaudit(u32 ssid, u32 tsid,
 	rcu_read_unlock();
 
 	if (unlikely(denied))
-		return avc_denied(ssid, tsid, tclass, requested, 0, 0,
+		return avc_denied(ssid, tsid, tclass, requested, 0, 0, 0,
 				  flags, avd);
 	return 0;
 }
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 366c87a40bd1..171dd7fceac5 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -3688,8 +3688,8 @@ static int ioctl_has_perm(const struct cred *cred, struct file *file,
 		return 0;
 
 	isec = inode_security(inode);
-	rc = avc_has_extended_perms(ssid, isec->sid, isec->sclass,
-				    requested, driver, xperm, &ad);
+	rc = avc_has_extended_perms(ssid, isec->sid, isec->sclass, requested,
+				    driver, AVC_EXT_IOCTL, xperm, &ad);
 out:
 	return rc;
 }
@@ -5952,7 +5952,7 @@ static int nlmsg_sock_has_extended_perms(struct sock *sk, u32 perms, u16 nlmsg_t
 	xperm = nlmsg_type & 0xff;
 
 	return avc_has_extended_perms(current_sid(), sksec->sid, sksec->sclass,
-			perms, driver, xperm, &ad);
+				      perms, driver, AVC_EXT_NLMSG, xperm, &ad);
 }
 
 static int selinux_netlink_send(struct sock *sk, struct sk_buff *skb)
diff --git a/security/selinux/include/avc.h b/security/selinux/include/avc.h
index 96a614d47df8..281f40103663 100644
--- a/security/selinux/include/avc.h
+++ b/security/selinux/include/avc.h
@@ -136,8 +136,11 @@ int avc_has_perm_noaudit(u32 ssid, u32 tsid, u16 tclass, u32 requested,
 int avc_has_perm(u32 ssid, u32 tsid, u16 tclass, u32 requested,
 		 struct common_audit_data *auditdata);
 
+#define AVC_EXT_IOCTL	(1 << 0) /* Cache entry for an ioctl extended permission */
+#define AVC_EXT_NLMSG	(1 << 1) /* Cache entry for an nlmsg extended permission */
 int avc_has_extended_perms(u32 ssid, u32 tsid, u16 tclass, u32 requested,
-			   u8 driver, u8 perm, struct common_audit_data *ad);
+			   u8 driver, u8 base_perm, u8 perm,
+			   struct common_audit_data *ad);
 
 u32 avc_policy_seqno(void);
 
diff --git a/security/selinux/include/security.h b/security/selinux/include/security.h
index c7f2731abd03..700bd6c8bb38 100644
--- a/security/selinux/include/security.h
+++ b/security/selinux/include/security.h
@@ -239,6 +239,7 @@ struct extended_perms_data {
 struct extended_perms_decision {
 	u8 used;
 	u8 driver;
+	u8 base_perm;
 	struct extended_perms_data *allowed;
 	struct extended_perms_data *auditallow;
 	struct extended_perms_data *dontaudit;
@@ -246,6 +247,7 @@ struct extended_perms_decision {
 
 struct extended_perms {
 	u16 len; /* length associated decision chain */
+	u8 base_perms; /* which base permissions are covered */
 	struct extended_perms_data drivers; /* flag drivers that are used */
 };
 
@@ -257,6 +259,7 @@ void security_compute_av(u32 ssid, u32 tsid, u16 tclass,
 			 struct extended_perms *xperms);
 
 void security_compute_xperms_decision(u32 ssid, u32 tsid, u16 tclass, u8 driver,
+				      u8 base_perm,
 				      struct extended_perms_decision *xpermd);
 
 void security_compute_av_user(u32 ssid, u32 tsid, u16 tclass,
diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c
index 3d5c563cfc4c..d9f58b5d0f49 100644
--- a/security/selinux/ss/services.c
+++ b/security/selinux/ss/services.c
@@ -582,7 +582,7 @@ static void type_attribute_bounds_av(struct policydb *policydb,
 }
 
 /*
- * Flag which drivers have permissions.
+ * Flag which drivers have permissions and which base permissions are covered.
  */
 void services_compute_xperms_drivers(
 		struct extended_perms *xperms,
@@ -592,12 +592,19 @@ void services_compute_xperms_drivers(
 
 	switch (node->datum.u.xperms->specified) {
 	case AVTAB_XPERMS_IOCTLDRIVER:
+		xperms->base_perms |= AVC_EXT_IOCTL;
 		/* if one or more driver has all permissions allowed */
 		for (i = 0; i < ARRAY_SIZE(xperms->drivers.p); i++)
 			xperms->drivers.p[i] |= node->datum.u.xperms->perms.p[i];
 		break;
 	case AVTAB_XPERMS_IOCTLFUNCTION:
+		xperms->base_perms |= AVC_EXT_IOCTL;
+		/* if allowing permissions within a driver */
+		security_xperm_set(xperms->drivers.p,
+					node->datum.u.xperms->driver);
+		break;
 	case AVTAB_XPERMS_NLMSG:
+		xperms->base_perms |= AVC_EXT_NLMSG;
 		/* if allowing permissions within a driver */
 		security_xperm_set(xperms->drivers.p,
 					node->datum.u.xperms->driver);
@@ -631,8 +638,7 @@ static void context_struct_compute_av(struct policydb *policydb,
 	avd->auditallow = 0;
 	avd->auditdeny = 0xffffffff;
 	if (xperms) {
-		memset(&xperms->drivers, 0, sizeof(xperms->drivers));
-		xperms->len = 0;
+		memset(xperms, 0, sizeof(*xperms));
 	}
 
 	if (unlikely(!tclass || tclass > policydb->p_classes.nprim)) {
@@ -969,13 +975,19 @@ void services_compute_xperms_decision(struct extended_perms_decision *xpermd,
 {
 	switch (node->datum.u.xperms->specified) {
 	case AVTAB_XPERMS_IOCTLFUNCTION:
-	case AVTAB_XPERMS_NLMSG:
-		if (xpermd->driver != node->datum.u.xperms->driver)
+		if (xpermd->base_perm != AVC_EXT_IOCTL ||
+		    xpermd->driver != node->datum.u.xperms->driver)
 			return;
 		break;
 	case AVTAB_XPERMS_IOCTLDRIVER:
-		if (!security_xperm_test(node->datum.u.xperms->perms.p,
-					xpermd->driver))
+		if (xpermd->base_perm != AVC_EXT_IOCTL ||
+		    !security_xperm_test(node->datum.u.xperms->perms.p,
+					 xpermd->driver))
+			return;
+		break;
+	case AVTAB_XPERMS_NLMSG:
+		if (xpermd->base_perm != AVC_EXT_NLMSG ||
+		    xpermd->driver != node->datum.u.xperms->driver)
 			return;
 		break;
 	default:
@@ -1010,6 +1022,7 @@ void security_compute_xperms_decision(u32 ssid,
 				      u32 tsid,
 				      u16 orig_tclass,
 				      u8 driver,
+				      u8 base_perm,
 				      struct extended_perms_decision *xpermd)
 {
 	struct selinux_policy *policy;
@@ -1023,6 +1036,7 @@ void security_compute_xperms_decision(u32 ssid,
 	struct ebitmap_node *snode, *tnode;
 	unsigned int i, j;
 
+	xpermd->base_perm = base_perm;
 	xpermd->driver = driver;
 	xpermd->used = 0;
 	memset(xpermd->allowed->p, 0, sizeof(xpermd->allowed->p));
diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h
index 8516c1ccd57a..7e46ca4cd31b 100644
--- a/tools/include/uapi/linux/if_link.h
+++ b/tools/include/uapi/linux/if_link.h
@@ -1315,6 +1315,8 @@ enum {
 	IFLA_NETKIT_MODE,
 	IFLA_NETKIT_SCRUB,
 	IFLA_NETKIT_PEER_SCRUB,
+	IFLA_NETKIT_HEADROOM,
+	IFLA_NETKIT_TAILROOM,
 	__IFLA_NETKIT_MAX,
 };
 #define IFLA_NETKIT_MAX	(__IFLA_NETKIT_MAX - 1)
diff --git a/tools/net/ynl/Makefile b/tools/net/ynl/Makefile
index d1cdf2a8f826..211df5a93ad9 100644
--- a/tools/net/ynl/Makefile
+++ b/tools/net/ynl/Makefile
@@ -1,5 +1,17 @@
 # SPDX-License-Identifier: GPL-2.0
 
+include ../../scripts/Makefile.arch
+
+INSTALL	?= install
+prefix  ?= /usr
+ifeq ($(LP64), 1)
+  libdir_relative = lib64
+else
+  libdir_relative = lib
+endif
+libdir  ?= $(prefix)/$(libdir_relative)
+includedir ?= $(prefix)/include
+
 SUBDIRS = lib generated samples
 
 all: $(SUBDIRS) libynl.a
@@ -21,5 +33,20 @@ clean distclean:
 		fi \
 	done
 	rm -f libynl.a
+	rm -rf pyynl/__pycache__
+	rm -rf pyynl/lib/__pycache__
+	rm -rf pyynl.egg-info
+	rm -rf build
+
+install: libynl.a lib/*.h
+	@echo -e "\tINSTALL libynl.a"
+	@$(INSTALL) -d $(DESTDIR)$(libdir)
+	@$(INSTALL) -m 0644 libynl.a $(DESTDIR)$(libdir)/libynl.a
+	@echo -e "\tINSTALL libynl headers"
+	@$(INSTALL) -d $(DESTDIR)$(includedir)/ynl
+	@$(INSTALL) -m 0644 lib/*.h $(DESTDIR)$(includedir)/ynl/
+	@echo -e "\tINSTALL pyynl"
+	@pip install --prefix=$(DESTDIR)$(prefix) .
+	@make -C generated install
 
-.PHONY: all clean distclean $(SUBDIRS)
+.PHONY: all clean distclean install $(SUBDIRS)
diff --git a/tools/net/ynl/generated/.gitignore b/tools/net/ynl/generated/.gitignore
index ade488626d26..859a6fb446e1 100644
--- a/tools/net/ynl/generated/.gitignore
+++ b/tools/net/ynl/generated/.gitignore
@@ -1,2 +1,3 @@
 *-user.c
 *-user.h
+*.rst
diff --git a/tools/net/ynl/generated/Makefile b/tools/net/ynl/generated/Makefile
index 7db5240de58a..21f9e299dc75 100644
--- a/tools/net/ynl/generated/Makefile
+++ b/tools/net/ynl/generated/Makefile
@@ -7,32 +7,44 @@ ifeq ("$(DEBUG)","1")
   CFLAGS += -g -fsanitize=address -fsanitize=leak -static-libasan
 endif
 
+INSTALL     ?= install
+prefix      ?= /usr
+datarootdir ?= $(prefix)/share
+docdir      ?= $(datarootdir)/doc
+includedir  ?= $(prefix)/include
+
 include ../Makefile.deps
 
 YNL_GEN_ARG_ethtool:=--user-header linux/ethtool_netlink.h \
 	--exclude-op stats-get
 
-TOOL:=../ynl-gen-c.py
+TOOL:=../pyynl/ynl_gen_c.py
+TOOL_RST:=../pyynl/ynl_gen_rst.py
 
+SPECS_DIR:=../../../../Documentation/netlink/specs
 GENS_PATHS=$(shell grep -nrI --files-without-match \
 		'protocol: netlink' \
-		../../../../Documentation/netlink/specs/)
-GENS=$(patsubst ../../../../Documentation/netlink/specs/%.yaml,%,${GENS_PATHS})
+		$(SPECS_DIR))
+GENS=$(patsubst $(SPECS_DIR)/%.yaml,%,${GENS_PATHS})
 SRCS=$(patsubst %,%-user.c,${GENS})
 HDRS=$(patsubst %,%-user.h,${GENS})
 OBJS=$(patsubst %,%-user.o,${GENS})
 
-all: protos.a $(HDRS) $(SRCS) $(KHDRS) $(KSRCS) $(UAPI)
+SPECS_PATHS=$(wildcard $(SPECS_DIR)/*.yaml)
+SPECS=$(patsubst $(SPECS_DIR)/%.yaml,%,${SPECS_PATHS})
+RSTS=$(patsubst %,%.rst,${SPECS})
+
+all: protos.a $(HDRS) $(SRCS) $(KHDRS) $(KSRCS) $(UAPI) $(RSTS)
 
 protos.a: $(OBJS)
 	@echo -e "\tAR $@"
 	@ar rcs $@ $(OBJS)
 
-%-user.h: ../../../../Documentation/netlink/specs/%.yaml $(TOOL)
+%-user.h: $(SPECS_DIR)/%.yaml $(TOOL)
 	@echo -e "\tGEN $@"
 	@$(TOOL) --mode user --header --spec $< -o $@ $(YNL_GEN_ARG_$*)
 
-%-user.c: ../../../../Documentation/netlink/specs/%.yaml $(TOOL)
+%-user.c: $(SPECS_DIR)/%.yaml $(TOOL)
 	@echo -e "\tGEN $@"
 	@$(TOOL) --mode user --source --spec $< -o $@ $(YNL_GEN_ARG_$*)
 
@@ -40,14 +52,37 @@ protos.a: $(OBJS)
 	@echo -e "\tCC $@"
 	@$(COMPILE.c) $(CFLAGS_$*) -o $@ $<
 
+%.rst: $(SPECS_DIR)/%.yaml $(TOOL_RST)
+	@echo -e "\tGEN_RST $@"
+	@$(TOOL_RST) -o $@ -i $<
+
 clean:
 	rm -f *.o
 
 distclean: clean
-	rm -f *.c *.h *.a
+	rm -f *.c *.h *.a *.rst
 
 regen:
 	@../ynl-regen.sh
 
-.PHONY: all clean distclean regen
+install-headers: $(HDRS)
+	@echo -e "\tINSTALL generated headers"
+	@$(INSTALL) -d $(DESTDIR)$(includedir)/ynl
+	@$(INSTALL) -m 0644 *.h $(DESTDIR)$(includedir)/ynl/
+
+install-rsts: $(RSTS)
+	@echo -e "\tINSTALL generated docs"
+	@$(INSTALL) -d $(DESTDIR)$(docdir)/ynl
+	@$(INSTALL) -m 0644 $(RSTS) $(DESTDIR)$(docdir)/ynl/
+
+install-specs:
+	@echo -e "\tINSTALL specs"
+	@$(INSTALL) -d $(DESTDIR)$(datarootdir)/ynl
+	@$(INSTALL) -m 0644 ../../../../Documentation/netlink/*.yaml $(DESTDIR)$(datarootdir)/ynl/
+	@$(INSTALL) -d $(DESTDIR)$(datarootdir)/ynl/specs
+	@$(INSTALL) -m 0644 $(SPECS_DIR)/*.yaml $(DESTDIR)$(datarootdir)/ynl/specs/
+
+install: install-headers install-rsts install-specs
+
+.PHONY: all clean distclean regen install install-headers install-rsts install-specs
 .DEFAULT_GOAL: all
diff --git a/tools/net/ynl/lib/.gitignore b/tools/net/ynl/lib/.gitignore
index 296c4035dbf2..a4383358ec72 100644
--- a/tools/net/ynl/lib/.gitignore
+++ b/tools/net/ynl/lib/.gitignore
@@ -1,2 +1 @@
-__pycache__/
 *.d
diff --git a/tools/net/ynl/lib/Makefile b/tools/net/ynl/lib/Makefile
index 94c49cca3dca..4b2b98704ff9 100644
--- a/tools/net/ynl/lib/Makefile
+++ b/tools/net/ynl/lib/Makefile
@@ -19,7 +19,6 @@ ynl.a: $(OBJS)
 
 clean:
 	rm -f *.o *.d *~
-	rm -rf __pycache__
 
 distclean: clean
 	rm -f *.a
diff --git a/tools/net/ynl/pyproject.toml b/tools/net/ynl/pyproject.toml
new file mode 100644
index 000000000000..a81d8779b0e0
--- /dev/null
+++ b/tools/net/ynl/pyproject.toml
@@ -0,0 +1,24 @@
+[build-system]
+requires = ["setuptools>=61.0"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "pyynl"
+authors = [
+    {name = "Donald Hunter", email = "donald.hunter@gmail.com"},
+    {name = "Jakub Kicinski", email = "kuba@kernel.org"},
+]
+description = "yaml netlink (ynl)"
+version = "0.0.1"
+requires-python = ">=3.9"
+dependencies = [
+    "pyyaml==6.*",
+    "jsonschema==4.*"
+]
+
+[tool.setuptools.packages.find]
+include = ["pyynl", "pyynl.lib"]
+
+[project.scripts]
+ynl = "pyynl.cli:main"
+ynl-ethtool = "pyynl.ethtool:main"
diff --git a/tools/net/ynl/pyynl/.gitignore b/tools/net/ynl/pyynl/.gitignore
new file mode 100644
index 000000000000..b801cd2d016e
--- /dev/null
+++ b/tools/net/ynl/pyynl/.gitignore
@@ -0,0 +1,2 @@
+__pycache__/
+lib/__pycache__/
diff --git a/tools/net/ynl/pyynl/__init__.py b/tools/net/ynl/pyynl/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
--- /dev/null
+++ b/tools/net/ynl/pyynl/__init__.py
diff --git a/tools/net/ynl/cli.py b/tools/net/ynl/pyynl/cli.py
index 41d9fa5c818d..794e3c7dcc65 100755
--- a/tools/net/ynl/cli.py
+++ b/tools/net/ynl/pyynl/cli.py
@@ -3,6 +3,7 @@
 
 import argparse
 import json
+import os
 import pathlib
 import pprint
 import sys
@@ -10,6 +11,24 @@ import sys
 sys.path.append(pathlib.Path(__file__).resolve().parent.as_posix())
 from lib import YnlFamily, Netlink, NlError
 
+sys_schema_dir='/usr/share/ynl'
+relative_schema_dir='../../../../Documentation/netlink'
+
+def schema_dir():
+    script_dir = os.path.dirname(os.path.abspath(__file__))
+    schema_dir = os.path.abspath(f"{script_dir}/{relative_schema_dir}")
+    if not os.path.isdir(schema_dir):
+        schema_dir = sys_schema_dir
+    if not os.path.isdir(schema_dir):
+        raise Exception(f"Schema directory {schema_dir} does not exist")
+    return schema_dir
+
+def spec_dir():
+    spec_dir = schema_dir() + '/specs'
+    if not os.path.isdir(spec_dir):
+        raise Exception(f"Spec directory {spec_dir} does not exist")
+    return spec_dir
+
 
 class YnlEncoder(json.JSONEncoder):
     def default(self, obj):
@@ -32,7 +51,14 @@ def main():
 
     parser = argparse.ArgumentParser(description=description,
                                      epilog=epilog)
-    parser.add_argument('--spec', dest='spec', type=str, required=True)
+    spec_group = parser.add_mutually_exclusive_group(required=True)
+    spec_group.add_argument('--family', dest='family', type=str,
+                            help='name of the netlink FAMILY')
+    spec_group.add_argument('--list-families', action='store_true',
+                            help='list all netlink families supported by YNL (has spec)')
+    spec_group.add_argument('--spec', dest='spec', type=str,
+                            help='choose the family by SPEC file path')
+
     parser.add_argument('--schema', dest='schema', type=str)
     parser.add_argument('--no-schema', action='store_true')
     parser.add_argument('--json', dest='json_text', type=str)
@@ -70,6 +96,12 @@ def main():
         else:
             pprint.PrettyPrinter().pprint(msg)
 
+    if args.list_families:
+        for filename in sorted(os.listdir(spec_dir())):
+            if filename.endswith('.yaml'):
+                print(filename.removesuffix('.yaml'))
+        return
+
     if args.no_schema:
         args.schema = ''
 
@@ -77,7 +109,16 @@ def main():
     if args.json_text:
         attrs = json.loads(args.json_text)
 
-    ynl = YnlFamily(args.spec, args.schema, args.process_unknown,
+    if args.family:
+        spec = f"{spec_dir()}/{args.family}.yaml"
+        if args.schema is None and spec.startswith(sys_schema_dir):
+            args.schema = '' # disable schema validation when installed
+    else:
+        spec = args.spec
+    if not os.path.isfile(spec):
+        raise Exception(f"Spec file {spec} does not exist")
+
+    ynl = YnlFamily(spec, args.schema, args.process_unknown,
                     recv_size=args.dbg_small_recv)
     if args.dbg_small_recv:
         ynl.set_recv_dbg(True)
diff --git a/tools/net/ynl/ethtool.py b/tools/net/ynl/pyynl/ethtool.py
index ebb0a11f67bf..af7fddd7b085 100755
--- a/tools/net/ynl/ethtool.py
+++ b/tools/net/ynl/pyynl/ethtool.py
@@ -11,6 +11,7 @@ import os
 
 sys.path.append(pathlib.Path(__file__).resolve().parent.as_posix())
 from lib import YnlFamily
+from cli import schema_dir, spec_dir
 
 def args_to_req(ynl, op_name, args, req):
     """
@@ -156,10 +157,8 @@ def main():
     args = parser.parse_args()
 
     script_abs_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
-    spec = os.path.join(script_abs_dir,
-                        '../../../Documentation/netlink/specs/ethtool.yaml')
-    schema = os.path.join(script_abs_dir,
-                          '../../../Documentation/netlink/genetlink-legacy.yaml')
+    spec = os.path.join(spec_dir(), 'ethtool.yaml')
+    schema = os.path.join(schema_dir(), 'genetlink-legacy.yaml')
 
     ynl = YnlFamily(spec, schema)
 
diff --git a/tools/net/ynl/lib/__init__.py b/tools/net/ynl/pyynl/lib/__init__.py
index 9137b83e580a..9137b83e580a 100644
--- a/tools/net/ynl/lib/__init__.py
+++ b/tools/net/ynl/pyynl/lib/__init__.py
diff --git a/tools/net/ynl/lib/nlspec.py b/tools/net/ynl/pyynl/lib/nlspec.py
index a745739655ad..314ec8007496 100644
--- a/tools/net/ynl/lib/nlspec.py
+++ b/tools/net/ynl/pyynl/lib/nlspec.py
@@ -219,7 +219,10 @@ class SpecAttrSet(SpecElement):
         else:
             real_set = family.attr_sets[self.subset_of]
             for elem in self.yaml['attributes']:
-                attr = real_set[elem['name']]
+                real_attr = real_set[elem['name']]
+                combined_elem = real_attr.yaml | elem
+                attr = self.new_attr(combined_elem, real_attr.value)
+
                 self.attrs[attr.name] = attr
                 self.attrs_by_val[attr.value] = attr
 
diff --git a/tools/net/ynl/lib/ynl.py b/tools/net/ynl/pyynl/lib/ynl.py
index eea29359a899..08f8bf89cfc2 100644
--- a/tools/net/ynl/lib/ynl.py
+++ b/tools/net/ynl/pyynl/lib/ynl.py
@@ -733,41 +733,45 @@ class YnlFamily(SpecFamily):
                 self._rsp_add(rsp, attr_name, None, self._decode_unknown(attr))
                 continue
 
-            if attr_spec["type"] == 'nest':
-                subdict = self._decode(NlAttrs(attr.raw), attr_spec['nested-attributes'], search_attrs)
-                decoded = subdict
-            elif attr_spec["type"] == 'string':
-                decoded = attr.as_strz()
-            elif attr_spec["type"] == 'binary':
-                decoded = self._decode_binary(attr, attr_spec)
-            elif attr_spec["type"] == 'flag':
-                decoded = True
-            elif attr_spec.is_auto_scalar:
-                decoded = attr.as_auto_scalar(attr_spec['type'], attr_spec.byte_order)
-            elif attr_spec["type"] in NlAttr.type_formats:
-                decoded = attr.as_scalar(attr_spec['type'], attr_spec.byte_order)
-                if 'enum' in attr_spec:
-                    decoded = self._decode_enum(decoded, attr_spec)
-                elif attr_spec.display_hint:
-                    decoded = self._formatted_string(decoded, attr_spec.display_hint)
-            elif attr_spec["type"] == 'indexed-array':
-                decoded = self._decode_array_attr(attr, attr_spec)
-            elif attr_spec["type"] == 'bitfield32':
-                value, selector = struct.unpack("II", attr.raw)
-                if 'enum' in attr_spec:
-                    value = self._decode_enum(value, attr_spec)
-                    selector = self._decode_enum(selector, attr_spec)
-                decoded = {"value": value, "selector": selector}
-            elif attr_spec["type"] == 'sub-message':
-                decoded = self._decode_sub_msg(attr, attr_spec, search_attrs)
-            elif attr_spec["type"] == 'nest-type-value':
-                decoded = self._decode_nest_type_value(attr, attr_spec)
-            else:
-                if not self.process_unknown:
-                    raise Exception(f'Unknown {attr_spec["type"]} with name {attr_spec["name"]}')
-                decoded = self._decode_unknown(attr)
-
-            self._rsp_add(rsp, attr_spec["name"], attr_spec.is_multi, decoded)
+            try:
+                if attr_spec["type"] == 'nest':
+                    subdict = self._decode(NlAttrs(attr.raw), attr_spec['nested-attributes'], search_attrs)
+                    decoded = subdict
+                elif attr_spec["type"] == 'string':
+                    decoded = attr.as_strz()
+                elif attr_spec["type"] == 'binary':
+                    decoded = self._decode_binary(attr, attr_spec)
+                elif attr_spec["type"] == 'flag':
+                    decoded = True
+                elif attr_spec.is_auto_scalar:
+                    decoded = attr.as_auto_scalar(attr_spec['type'], attr_spec.byte_order)
+                elif attr_spec["type"] in NlAttr.type_formats:
+                    decoded = attr.as_scalar(attr_spec['type'], attr_spec.byte_order)
+                    if 'enum' in attr_spec:
+                        decoded = self._decode_enum(decoded, attr_spec)
+                    elif attr_spec.display_hint:
+                        decoded = self._formatted_string(decoded, attr_spec.display_hint)
+                elif attr_spec["type"] == 'indexed-array':
+                    decoded = self._decode_array_attr(attr, attr_spec)
+                elif attr_spec["type"] == 'bitfield32':
+                    value, selector = struct.unpack("II", attr.raw)
+                    if 'enum' in attr_spec:
+                        value = self._decode_enum(value, attr_spec)
+                        selector = self._decode_enum(selector, attr_spec)
+                    decoded = {"value": value, "selector": selector}
+                elif attr_spec["type"] == 'sub-message':
+                    decoded = self._decode_sub_msg(attr, attr_spec, search_attrs)
+                elif attr_spec["type"] == 'nest-type-value':
+                    decoded = self._decode_nest_type_value(attr, attr_spec)
+                else:
+                    if not self.process_unknown:
+                        raise Exception(f'Unknown {attr_spec["type"]} with name {attr_spec["name"]}')
+                    decoded = self._decode_unknown(attr)
+
+                self._rsp_add(rsp, attr_spec["name"], attr_spec.is_multi, decoded)
+            except:
+                print(f"Error decoding '{attr_spec.name}' from '{space}'")
+                raise
 
         return rsp
 
diff --git a/tools/net/ynl/ynl-gen-c.py b/tools/net/ynl/pyynl/ynl_gen_c.py
index ec2288948795..d3a7dfbcf929 100755
--- a/tools/net/ynl/ynl-gen-c.py
+++ b/tools/net/ynl/pyynl/ynl_gen_c.py
@@ -79,6 +79,20 @@ class Type(SpecAttr):
         self.enum_name = None
         delattr(self, "enum_name")
 
+    def _get_real_attr(self):
+        # if the attr is for a subset return the "real" attr (just one down, does not recurse)
+        return self.family.attr_sets[self.attr_set.subset_of][self.name]
+
+    def set_request(self):
+        self.request = True
+        if self.attr_set.subset_of:
+            self._get_real_attr().set_request()
+
+    def set_reply(self):
+        self.reply = True
+        if self.attr_set.subset_of:
+            self._get_real_attr().set_reply()
+
     def get_limit(self, limit, default=None):
         value = self.checks.get(limit, default)
         if value is None:
@@ -106,6 +120,10 @@ class Type(SpecAttr):
             enum_name = f"{self.attr_set.name_prefix}{self.name}"
         self.enum_name = c_upper(enum_name)
 
+        if self.attr_set.subset_of:
+            if self.checks != self._get_real_attr().checks:
+                raise Exception("Overriding checks not supported by codegen, yet")
+
     def is_multi_val(self):
         return None
 
@@ -1119,17 +1137,17 @@ class Family(SpecFamily):
         for _, struct in self.pure_nested_structs.items():
             if struct.request:
                 for _, arg in struct.member_list():
-                    arg.request = True
+                    arg.set_request()
             if struct.reply:
                 for _, arg in struct.member_list():
-                    arg.reply = True
+                    arg.set_reply()
 
         for root_set, rs_members in self.root_sets.items():
             for attr, spec in self.attr_sets[root_set].items():
                 if attr in rs_members['request']:
-                    spec.request = True
+                    spec.set_request()
                 if attr in rs_members['reply']:
-                    spec.reply = True
+                    spec.set_reply()
 
     def _load_global_policy(self):
         global_set = set()
@@ -1765,7 +1783,14 @@ def parse_rsp_nested(ri, struct):
                   f'{struct.ptr_name}dst = yarg->data;']
     init_lines = []
 
-    _multi_parse(ri, struct, init_lines, local_vars)
+    if struct.member_list():
+        _multi_parse(ri, struct, init_lines, local_vars)
+    else:
+        # Empty nest
+        ri.cw.block_start()
+        ri.cw.p('return 0;')
+        ri.cw.block_end()
+        ri.cw.nl()
 
 
 def parse_rsp_msg(ri, deref=False):
@@ -2592,7 +2617,8 @@ def render_uapi(family, cw):
                 val = attr.value
             val += 1
             cw.p(attr.enum_name + suffix)
-        cw.nl()
+        if attr_set.items():
+            cw.nl()
         cw.p(attr_set.cnt_name + ('' if max_by_define else ','))
         if not max_by_define:
             cw.p(f"{attr_set.max_name} = {max_value}")
diff --git a/tools/net/ynl/ynl-gen-rst.py b/tools/net/ynl/pyynl/ynl_gen_rst.py
index 6c56d0d726b4..6c56d0d726b4 100755
--- a/tools/net/ynl/ynl-gen-rst.py
+++ b/tools/net/ynl/pyynl/ynl_gen_rst.py
diff --git a/tools/net/ynl/ynl-regen.sh b/tools/net/ynl/ynl-regen.sh
index a37304dcc88e..81b4ecd89100 100755
--- a/tools/net/ynl/ynl-regen.sh
+++ b/tools/net/ynl/ynl-regen.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 # SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
 
-TOOL=$(dirname $(realpath $0))/ynl-gen-c.py
+TOOL=$(dirname $(realpath $0))/pyynl/ynl_gen_c.py
 
 force=
 search=
diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h
index 2f36b7b6418d..625f5b046776 100644
--- a/tools/sched_ext/include/scx/common.bpf.h
+++ b/tools/sched_ext/include/scx/common.bpf.h
@@ -40,9 +40,9 @@ void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_fl
 void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym __weak;
 u32 scx_bpf_dispatch_nr_slots(void) __ksym;
 void scx_bpf_dispatch_cancel(void) __ksym;
-bool scx_bpf_dsq_move_to_local(u64 dsq_id) __ksym;
-void scx_bpf_dsq_move_set_slice(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym;
-void scx_bpf_dsq_move_set_vtime(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym;
+bool scx_bpf_dsq_move_to_local(u64 dsq_id) __ksym __weak;
+void scx_bpf_dsq_move_set_slice(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym __weak;
+void scx_bpf_dsq_move_set_vtime(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym __weak;
 bool scx_bpf_dsq_move(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
 bool scx_bpf_dsq_move_vtime(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
 u32 scx_bpf_reenqueue_local(void) __ksym;
diff --git a/tools/sched_ext/scx_central.c b/tools/sched_ext/scx_central.c
index 21deea320bd7..e938156ed0a0 100644
--- a/tools/sched_ext/scx_central.c
+++ b/tools/sched_ext/scx_central.c
@@ -97,7 +97,7 @@ restart:
 	SCX_BUG_ON(!cpuset, "Failed to allocate cpuset");
 	CPU_ZERO(cpuset);
 	CPU_SET(skel->rodata->central_cpu, cpuset);
-	SCX_BUG_ON(sched_setaffinity(0, sizeof(cpuset), cpuset),
+	SCX_BUG_ON(sched_setaffinity(0, sizeof(*cpuset), cpuset),
 		   "Failed to affinitize to central CPU %d (max %d)",
 		   skel->rodata->central_cpu, skel->rodata->nr_cpu_ids - 1);
 	CPU_FREE(cpuset);
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 7eeb3cbe18c7..0a016cd71cba 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -129,7 +129,6 @@ TEST_FILES = xsk_prereqs.sh $(wildcard progs/btf_dump_test_case_*.c)
 TEST_PROGS := test_kmod.sh \
 	test_xdp_redirect.sh \
 	test_xdp_redirect_multi.sh \
-	test_xdp_meta.sh \
 	test_tunnel.sh \
 	test_lwt_seg6local.sh \
 	test_lirc_mode2.sh \
diff --git a/tools/testing/selftests/bpf/prog_tests/tc_netkit.c b/tools/testing/selftests/bpf/prog_tests/tc_netkit.c
index 151a4210028f..2461d183dee5 100644
--- a/tools/testing/selftests/bpf/prog_tests/tc_netkit.c
+++ b/tools/testing/selftests/bpf/prog_tests/tc_netkit.c
@@ -14,10 +14,16 @@
 #include "netlink_helpers.h"
 #include "tc_helpers.h"
 
+#define NETKIT_HEADROOM	32
+#define NETKIT_TAILROOM	8
+
 #define MARK		42
 #define PRIO		0xeb9f
 #define ICMP_ECHO	8
 
+#define FLAG_ADJUST_ROOM (1 << 0)
+#define FLAG_SAME_NETNS  (1 << 1)
+
 struct icmphdr {
 	__u8		type;
 	__u8		code;
@@ -35,7 +41,7 @@ struct iplink_req {
 };
 
 static int create_netkit(int mode, int policy, int peer_policy, int *ifindex,
-			 bool same_netns, int scrub, int peer_scrub)
+			 int scrub, int peer_scrub, __u32 flags)
 {
 	struct rtnl_handle rth = { .fd = -1 };
 	struct iplink_req req = {};
@@ -63,6 +69,10 @@ static int create_netkit(int mode, int policy, int peer_policy, int *ifindex,
 	addattr32(&req.n, sizeof(req), IFLA_NETKIT_SCRUB, scrub);
 	addattr32(&req.n, sizeof(req), IFLA_NETKIT_PEER_SCRUB, peer_scrub);
 	addattr32(&req.n, sizeof(req), IFLA_NETKIT_MODE, mode);
+	if (flags & FLAG_ADJUST_ROOM) {
+		addattr16(&req.n, sizeof(req), IFLA_NETKIT_HEADROOM, NETKIT_HEADROOM);
+		addattr16(&req.n, sizeof(req), IFLA_NETKIT_TAILROOM, NETKIT_TAILROOM);
+	}
 	addattr_nest_end(&req.n, data);
 	addattr_nest_end(&req.n, linkinfo);
 
@@ -87,7 +97,7 @@ static int create_netkit(int mode, int policy, int peer_policy, int *ifindex,
 				 " addr ee:ff:bb:cc:aa:dd"),
 				 "set hwaddress");
 	}
-	if (same_netns) {
+	if (flags & FLAG_SAME_NETNS) {
 		ASSERT_OK(system("ip link set dev " netkit_peer " up"),
 				 "up peer");
 		ASSERT_OK(system("ip addr add dev " netkit_peer " 10.0.0.2/24"),
@@ -184,8 +194,8 @@ void serial_test_tc_netkit_basic(void)
 	int err, ifindex;
 
 	err = create_netkit(NETKIT_L2, NETKIT_PASS, NETKIT_PASS,
-			    &ifindex, false, NETKIT_SCRUB_DEFAULT,
-			    NETKIT_SCRUB_DEFAULT);
+			    &ifindex, NETKIT_SCRUB_DEFAULT,
+			    NETKIT_SCRUB_DEFAULT, 0);
 	if (err)
 		return;
 
@@ -299,8 +309,8 @@ static void serial_test_tc_netkit_multi_links_target(int mode, int target)
 	int err, ifindex;
 
 	err = create_netkit(mode, NETKIT_PASS, NETKIT_PASS,
-			    &ifindex, false, NETKIT_SCRUB_DEFAULT,
-			    NETKIT_SCRUB_DEFAULT);
+			    &ifindex, NETKIT_SCRUB_DEFAULT,
+			    NETKIT_SCRUB_DEFAULT, 0);
 	if (err)
 		return;
 
@@ -428,8 +438,8 @@ static void serial_test_tc_netkit_multi_opts_target(int mode, int target)
 	int err, ifindex;
 
 	err = create_netkit(mode, NETKIT_PASS, NETKIT_PASS,
-			    &ifindex, false, NETKIT_SCRUB_DEFAULT,
-			    NETKIT_SCRUB_DEFAULT);
+			    &ifindex, NETKIT_SCRUB_DEFAULT,
+			    NETKIT_SCRUB_DEFAULT, 0);
 	if (err)
 		return;
 
@@ -543,8 +553,8 @@ void serial_test_tc_netkit_device(void)
 	int err, ifindex, ifindex2;
 
 	err = create_netkit(NETKIT_L3, NETKIT_PASS, NETKIT_PASS,
-			    &ifindex, true, NETKIT_SCRUB_DEFAULT,
-			    NETKIT_SCRUB_DEFAULT);
+			    &ifindex, NETKIT_SCRUB_DEFAULT,
+			    NETKIT_SCRUB_DEFAULT, FLAG_SAME_NETNS);
 	if (err)
 		return;
 
@@ -655,8 +665,8 @@ static void serial_test_tc_netkit_neigh_links_target(int mode, int target)
 	int err, ifindex;
 
 	err = create_netkit(mode, NETKIT_PASS, NETKIT_PASS,
-			    &ifindex, false, NETKIT_SCRUB_DEFAULT,
-			    NETKIT_SCRUB_DEFAULT);
+			    &ifindex, NETKIT_SCRUB_DEFAULT,
+			    NETKIT_SCRUB_DEFAULT, 0);
 	if (err)
 		return;
 
@@ -733,8 +743,8 @@ static void serial_test_tc_netkit_pkt_type_mode(int mode)
 	struct bpf_link *link;
 
 	err = create_netkit(mode, NETKIT_PASS, NETKIT_PASS,
-			    &ifindex, true, NETKIT_SCRUB_DEFAULT,
-			    NETKIT_SCRUB_DEFAULT);
+			    &ifindex, NETKIT_SCRUB_DEFAULT,
+			    NETKIT_SCRUB_DEFAULT, FLAG_SAME_NETNS);
 	if (err)
 		return;
 
@@ -799,7 +809,7 @@ void serial_test_tc_netkit_pkt_type(void)
 	serial_test_tc_netkit_pkt_type_mode(NETKIT_L3);
 }
 
-static void serial_test_tc_netkit_scrub_type(int scrub)
+static void serial_test_tc_netkit_scrub_type(int scrub, bool room)
 {
 	LIBBPF_OPTS(bpf_netkit_opts, optl);
 	struct test_tc_link *skel;
@@ -807,7 +817,8 @@ static void serial_test_tc_netkit_scrub_type(int scrub)
 	int err, ifindex;
 
 	err = create_netkit(NETKIT_L2, NETKIT_PASS, NETKIT_PASS,
-			    &ifindex, false, scrub, scrub);
+			    &ifindex, scrub, scrub,
+			    room ? FLAG_ADJUST_ROOM : 0);
 	if (err)
 		return;
 
@@ -842,6 +853,8 @@ static void serial_test_tc_netkit_scrub_type(int scrub)
 	ASSERT_EQ(skel->bss->seen_tc8, true, "seen_tc8");
 	ASSERT_EQ(skel->bss->mark, scrub == NETKIT_SCRUB_NONE ? MARK : 0, "mark");
 	ASSERT_EQ(skel->bss->prio, scrub == NETKIT_SCRUB_NONE ? PRIO : 0, "prio");
+	ASSERT_EQ(skel->bss->headroom, room ? NETKIT_HEADROOM : 0, "headroom");
+	ASSERT_EQ(skel->bss->tailroom, room ? NETKIT_TAILROOM : 0, "tailroom");
 cleanup:
 	test_tc_link__destroy(skel);
 
@@ -852,6 +865,6 @@ cleanup:
 
 void serial_test_tc_netkit_scrub(void)
 {
-	serial_test_tc_netkit_scrub_type(NETKIT_SCRUB_DEFAULT);
-	serial_test_tc_netkit_scrub_type(NETKIT_SCRUB_NONE);
+	serial_test_tc_netkit_scrub_type(NETKIT_SCRUB_DEFAULT, false);
+	serial_test_tc_netkit_scrub_type(NETKIT_SCRUB_NONE, true);
 }
diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c b/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c
index e6a783c7f5db..937da9b7532a 100644
--- a/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c
+++ b/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c
@@ -2,6 +2,14 @@
 #include <test_progs.h>
 #include <network_helpers.h>
 #include "test_xdp_context_test_run.skel.h"
+#include "test_xdp_meta.skel.h"
+
+#define TX_ADDR "10.0.0.1"
+#define RX_ADDR "10.0.0.2"
+#define RX_NAME "veth0"
+#define TX_NAME "veth1"
+#define TX_NETNS "xdp_context_tx"
+#define RX_NETNS "xdp_context_rx"
 
 void test_xdp_context_error(int prog_fd, struct bpf_test_run_opts opts,
 			    __u32 data_meta, __u32 data, __u32 data_end,
@@ -103,3 +111,82 @@ void test_xdp_context_test_run(void)
 
 	test_xdp_context_test_run__destroy(skel);
 }
+
+void test_xdp_context_functional(void)
+{
+	LIBBPF_OPTS(bpf_tc_hook, tc_hook, .attach_point = BPF_TC_INGRESS);
+	LIBBPF_OPTS(bpf_tc_opts, tc_opts, .handle = 1, .priority = 1);
+	struct netns_obj *rx_ns = NULL, *tx_ns = NULL;
+	struct bpf_program *tc_prog, *xdp_prog;
+	struct test_xdp_meta *skel = NULL;
+	struct nstoken *nstoken = NULL;
+	int rx_ifindex;
+	int ret;
+
+	tx_ns = netns_new(TX_NETNS, false);
+	if (!ASSERT_OK_PTR(tx_ns, "create tx_ns"))
+		return;
+
+	rx_ns = netns_new(RX_NETNS, false);
+	if (!ASSERT_OK_PTR(rx_ns, "create rx_ns"))
+		goto close;
+
+	SYS(close, "ip link add " RX_NAME " netns " RX_NETNS
+	    " type veth peer name " TX_NAME " netns " TX_NETNS);
+
+	nstoken = open_netns(RX_NETNS);
+	if (!ASSERT_OK_PTR(nstoken, "setns rx_ns"))
+		goto close;
+
+	SYS(close, "ip addr add " RX_ADDR "/24 dev " RX_NAME);
+	SYS(close, "ip link set dev " RX_NAME " up");
+
+	skel = test_xdp_meta__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "open and load skeleton"))
+		goto close;
+
+	rx_ifindex = if_nametoindex(RX_NAME);
+	if (!ASSERT_GE(rx_ifindex, 0, "if_nametoindex rx"))
+		goto close;
+
+	tc_hook.ifindex = rx_ifindex;
+	ret = bpf_tc_hook_create(&tc_hook);
+	if (!ASSERT_OK(ret, "bpf_tc_hook_create"))
+		goto close;
+
+	tc_prog = bpf_object__find_program_by_name(skel->obj, "ing_cls");
+	if (!ASSERT_OK_PTR(tc_prog, "open ing_cls prog"))
+		goto close;
+
+	tc_opts.prog_fd = bpf_program__fd(tc_prog);
+	ret = bpf_tc_attach(&tc_hook, &tc_opts);
+	if (!ASSERT_OK(ret, "bpf_tc_attach"))
+		goto close;
+
+	xdp_prog = bpf_object__find_program_by_name(skel->obj, "ing_xdp");
+	if (!ASSERT_OK_PTR(xdp_prog, "open ing_xdp prog"))
+		goto close;
+
+	ret = bpf_xdp_attach(rx_ifindex,
+			     bpf_program__fd(xdp_prog),
+			     0, NULL);
+	if (!ASSERT_GE(ret, 0, "bpf_xdp_attach"))
+		goto close;
+
+	close_netns(nstoken);
+
+	nstoken = open_netns(TX_NETNS);
+	if (!ASSERT_OK_PTR(nstoken, "setns tx_ns"))
+		goto close;
+
+	SYS(close, "ip addr add " TX_ADDR "/24 dev " TX_NAME);
+	SYS(close, "ip link set dev " TX_NAME " up");
+	ASSERT_OK(SYS_NOFAIL("ping -c 1 " RX_ADDR), "ping");
+
+close:
+	close_netns(nstoken);
+	test_xdp_meta__destroy(skel);
+	netns_free(rx_ns);
+	netns_free(tx_ns);
+}
+
diff --git a/tools/testing/selftests/bpf/progs/test_tc_link.c b/tools/testing/selftests/bpf/progs/test_tc_link.c
index 10d825928499..630f12e51b07 100644
--- a/tools/testing/selftests/bpf/progs/test_tc_link.c
+++ b/tools/testing/selftests/bpf/progs/test_tc_link.c
@@ -8,6 +8,7 @@
 #include <linux/if_packet.h>
 #include <bpf/bpf_endian.h>
 #include <bpf/bpf_helpers.h>
+#include <bpf/bpf_core_read.h>
 
 char LICENSE[] SEC("license") = "GPL";
 
@@ -27,6 +28,7 @@ bool seen_host;
 bool seen_mcast;
 
 int mark, prio;
+unsigned short headroom, tailroom;
 
 SEC("tc/ingress")
 int tc1(struct __sk_buff *skb)
@@ -104,11 +106,24 @@ out:
 	return TCX_PASS;
 }
 
+struct sk_buff {
+	struct net_device *dev;
+};
+
+struct net_device {
+	unsigned short needed_headroom;
+	unsigned short needed_tailroom;
+};
+
 SEC("tc/egress")
 int tc8(struct __sk_buff *skb)
 {
+	struct net_device *dev = BPF_CORE_READ((struct sk_buff *)skb, dev);
+
 	seen_tc8 = true;
 	mark = skb->mark;
 	prio = skb->priority;
+	headroom = BPF_CORE_READ(dev, needed_headroom);
+	tailroom = BPF_CORE_READ(dev, needed_tailroom);
 	return TCX_PASS;
 }
diff --git a/tools/testing/selftests/bpf/progs/test_xdp_meta.c b/tools/testing/selftests/bpf/progs/test_xdp_meta.c
index a7c4a7d49fe6..fe2d71ae0e71 100644
--- a/tools/testing/selftests/bpf/progs/test_xdp_meta.c
+++ b/tools/testing/selftests/bpf/progs/test_xdp_meta.c
@@ -8,7 +8,7 @@
 #define round_up(x, y) ((((x) - 1) | __round_mask(x, y)) + 1)
 #define ctx_ptr(ctx, mem) (void *)(unsigned long)ctx->mem
 
-SEC("t")
+SEC("tc")
 int ing_cls(struct __sk_buff *ctx)
 {
 	__u8 *data, *data_meta, *data_end;
@@ -28,7 +28,7 @@ int ing_cls(struct __sk_buff *ctx)
 	return diff ? TC_ACT_SHOT : TC_ACT_OK;
 }
 
-SEC("x")
+SEC("xdp")
 int ing_xdp(struct xdp_md *ctx)
 {
 	__u8 *data, *data_meta, *data_end;
diff --git a/tools/testing/selftests/bpf/test_xdp_meta.sh b/tools/testing/selftests/bpf/test_xdp_meta.sh
deleted file mode 100755
index 2740322c1878..000000000000
--- a/tools/testing/selftests/bpf/test_xdp_meta.sh
+++ /dev/null
@@ -1,58 +0,0 @@
-#!/bin/sh
-
-BPF_FILE="test_xdp_meta.bpf.o"
-# Kselftest framework requirement - SKIP code is 4.
-readonly KSFT_SKIP=4
-readonly NS1="ns1-$(mktemp -u XXXXXX)"
-readonly NS2="ns2-$(mktemp -u XXXXXX)"
-
-cleanup()
-{
-	if [ "$?" = "0" ]; then
-		echo "selftests: test_xdp_meta [PASS]";
-	else
-		echo "selftests: test_xdp_meta [FAILED]";
-	fi
-
-	set +e
-	ip link del veth1 2> /dev/null
-	ip netns del ${NS1} 2> /dev/null
-	ip netns del ${NS2} 2> /dev/null
-}
-
-ip link set dev lo xdp off 2>/dev/null > /dev/null
-if [ $? -ne 0 ];then
-	echo "selftests: [SKIP] Could not run test without the ip xdp support"
-	exit $KSFT_SKIP
-fi
-set -e
-
-ip netns add ${NS1}
-ip netns add ${NS2}
-
-trap cleanup 0 2 3 6 9
-
-ip link add veth1 type veth peer name veth2
-
-ip link set veth1 netns ${NS1}
-ip link set veth2 netns ${NS2}
-
-ip netns exec ${NS1} ip addr add 10.1.1.11/24 dev veth1
-ip netns exec ${NS2} ip addr add 10.1.1.22/24 dev veth2
-
-ip netns exec ${NS1} tc qdisc add dev veth1 clsact
-ip netns exec ${NS2} tc qdisc add dev veth2 clsact
-
-ip netns exec ${NS1} tc filter add dev veth1 ingress bpf da obj ${BPF_FILE} sec t
-ip netns exec ${NS2} tc filter add dev veth2 ingress bpf da obj ${BPF_FILE} sec t
-
-ip netns exec ${NS1} ip link set dev veth1 xdp obj ${BPF_FILE} sec x
-ip netns exec ${NS2} ip link set dev veth2 xdp obj ${BPF_FILE} sec x
-
-ip netns exec ${NS1} ip link set dev veth1 up
-ip netns exec ${NS2} ip link set dev veth2 up
-
-ip netns exec ${NS1} ping -c 1 10.1.1.22
-ip netns exec ${NS2} ping -c 1 10.1.1.11
-
-exit 0
diff --git a/tools/testing/selftests/bpf/xdp_hw_metadata.c b/tools/testing/selftests/bpf/xdp_hw_metadata.c
index 6f9956eed797..e38675d9b118 100644
--- a/tools/testing/selftests/bpf/xdp_hw_metadata.c
+++ b/tools/testing/selftests/bpf/xdp_hw_metadata.c
@@ -79,7 +79,7 @@ static int open_xsk(int ifindex, struct xsk *xsk, __u32 queue_id)
 		.fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
 		.comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
 		.frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE,
-		.flags = XSK_UMEM__DEFAULT_FLAGS,
+		.flags = XDP_UMEM_TX_METADATA_LEN,
 		.tx_metadata_len = sizeof(struct xsk_tx_metadata),
 	};
 	__u32 idx = 0;
@@ -551,6 +551,7 @@ static void hwtstamp_enable(const char *ifname)
 {
 	struct hwtstamp_config cfg = {
 		.rx_filter = HWTSTAMP_FILTER_ALL,
+		.tx_type = HWTSTAMP_TX_ON,
 	};
 
 	hwtstamp_ioctl(SIOCGHWTSTAMP, ifname, &saved_hwtstamp_cfg);
diff --git a/tools/testing/selftests/drivers/net/Makefile b/tools/testing/selftests/drivers/net/Makefile
index 0fec8f9801ad..469179c18935 100644
--- a/tools/testing/selftests/drivers/net/Makefile
+++ b/tools/testing/selftests/drivers/net/Makefile
@@ -1,11 +1,13 @@
 # SPDX-License-Identifier: GPL-2.0
 
 TEST_INCLUDES := $(wildcard lib/py/*.py) \
+		 $(wildcard lib/sh/*.sh) \
 		 ../../net/net_helper.sh \
 		 ../../net/lib.sh \
 
 TEST_PROGS := \
 	netcons_basic.sh \
+	netcons_overflow.sh \
 	ping.py \
 	queues.py \
 	stats.py \
diff --git a/tools/testing/selftests/drivers/net/bonding/Makefile b/tools/testing/selftests/drivers/net/bonding/Makefile
index 03a089165d3f..2b10854e4b1e 100644
--- a/tools/testing/selftests/drivers/net/bonding/Makefile
+++ b/tools/testing/selftests/drivers/net/bonding/Makefile
@@ -10,7 +10,7 @@ TEST_PROGS := \
 	mode-2-recovery-updelay.sh \
 	bond_options.sh \
 	bond-eth-type-change.sh \
-	bond_macvlan.sh
+	bond_macvlan_ipvlan.sh
 
 TEST_FILES := \
 	lag_lib.sh \
diff --git a/tools/testing/selftests/drivers/net/bonding/bond_macvlan.sh b/tools/testing/selftests/drivers/net/bonding/bond_macvlan.sh
deleted file mode 100755
index b609fb6231f4..000000000000
--- a/tools/testing/selftests/drivers/net/bonding/bond_macvlan.sh
+++ /dev/null
@@ -1,99 +0,0 @@
-#!/bin/bash
-# SPDX-License-Identifier: GPL-2.0
-#
-# Test macvlan over balance-alb
-
-lib_dir=$(dirname "$0")
-source ${lib_dir}/bond_topo_2d1c.sh
-
-m1_ns="m1-$(mktemp -u XXXXXX)"
-m2_ns="m1-$(mktemp -u XXXXXX)"
-m1_ip4="192.0.2.11"
-m1_ip6="2001:db8::11"
-m2_ip4="192.0.2.12"
-m2_ip6="2001:db8::12"
-
-cleanup()
-{
-	ip -n ${m1_ns} link del macv0
-	ip netns del ${m1_ns}
-	ip -n ${m2_ns} link del macv0
-	ip netns del ${m2_ns}
-
-	client_destroy
-	server_destroy
-	gateway_destroy
-}
-
-check_connection()
-{
-	local ns=${1}
-	local target=${2}
-	local message=${3:-"macvlan_over_bond"}
-	RET=0
-
-
-	ip netns exec ${ns} ping ${target} -c 4 -i 0.1 &>/dev/null
-	check_err $? "ping failed"
-	log_test "$mode: $message"
-}
-
-macvlan_over_bond()
-{
-	local param="$1"
-	RET=0
-
-	# setup new bond mode
-	bond_reset "${param}"
-
-	ip -n ${s_ns} link add link bond0 name macv0 type macvlan mode bridge
-	ip -n ${s_ns} link set macv0 netns ${m1_ns}
-	ip -n ${m1_ns} link set dev macv0 up
-	ip -n ${m1_ns} addr add ${m1_ip4}/24 dev macv0
-	ip -n ${m1_ns} addr add ${m1_ip6}/24 dev macv0
-
-	ip -n ${s_ns} link add link bond0 name macv0 type macvlan mode bridge
-	ip -n ${s_ns} link set macv0 netns ${m2_ns}
-	ip -n ${m2_ns} link set dev macv0 up
-	ip -n ${m2_ns} addr add ${m2_ip4}/24 dev macv0
-	ip -n ${m2_ns} addr add ${m2_ip6}/24 dev macv0
-
-	sleep 2
-
-	check_connection "${c_ns}" "${s_ip4}" "IPv4: client->server"
-	check_connection "${c_ns}" "${s_ip6}" "IPv6: client->server"
-	check_connection "${c_ns}" "${m1_ip4}" "IPv4: client->macvlan_1"
-	check_connection "${c_ns}" "${m1_ip6}" "IPv6: client->macvlan_1"
-	check_connection "${c_ns}" "${m2_ip4}" "IPv4: client->macvlan_2"
-	check_connection "${c_ns}" "${m2_ip6}" "IPv6: client->macvlan_2"
-	check_connection "${m1_ns}" "${m2_ip4}" "IPv4: macvlan_1->macvlan_2"
-	check_connection "${m1_ns}" "${m2_ip6}" "IPv6: macvlan_1->macvlan_2"
-
-
-	sleep 5
-
-	check_connection "${s_ns}" "${c_ip4}" "IPv4: server->client"
-	check_connection "${s_ns}" "${c_ip6}" "IPv6: server->client"
-	check_connection "${m1_ns}" "${c_ip4}" "IPv4: macvlan_1->client"
-	check_connection "${m1_ns}" "${c_ip6}" "IPv6: macvlan_1->client"
-	check_connection "${m2_ns}" "${c_ip4}" "IPv4: macvlan_2->client"
-	check_connection "${m2_ns}" "${c_ip6}" "IPv6: macvlan_2->client"
-	check_connection "${m2_ns}" "${m1_ip4}" "IPv4: macvlan_2->macvlan_2"
-	check_connection "${m2_ns}" "${m1_ip6}" "IPv6: macvlan_2->macvlan_2"
-
-	ip -n ${c_ns} neigh flush dev eth0
-}
-
-trap cleanup EXIT
-
-setup_prepare
-ip netns add ${m1_ns}
-ip netns add ${m2_ns}
-
-modes="active-backup balance-tlb balance-alb"
-
-for mode in $modes; do
-	macvlan_over_bond "mode $mode"
-done
-
-exit $EXIT_STATUS
diff --git a/tools/testing/selftests/drivers/net/bonding/bond_macvlan_ipvlan.sh b/tools/testing/selftests/drivers/net/bonding/bond_macvlan_ipvlan.sh
new file mode 100755
index 000000000000..c4711272fe45
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/bonding/bond_macvlan_ipvlan.sh
@@ -0,0 +1,96 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Test macvlan/ipvlan over bond
+
+lib_dir=$(dirname "$0")
+source ${lib_dir}/bond_topo_2d1c.sh
+
+xvlan1_ns="xvlan1-$(mktemp -u XXXXXX)"
+xvlan2_ns="xvlan2-$(mktemp -u XXXXXX)"
+xvlan1_ip4="192.0.2.11"
+xvlan1_ip6="2001:db8::11"
+xvlan2_ip4="192.0.2.12"
+xvlan2_ip6="2001:db8::12"
+
+cleanup()
+{
+	client_destroy
+	server_destroy
+	gateway_destroy
+
+	ip netns del ${xvlan1_ns}
+	ip netns del ${xvlan2_ns}
+}
+
+check_connection()
+{
+	local ns=${1}
+	local target=${2}
+	local message=${3}
+	RET=0
+
+	ip netns exec ${ns} ping ${target} -c 4 -i 0.1 &>/dev/null
+	check_err $? "ping failed"
+	log_test "${bond_mode}/${xvlan_type}_${xvlan_mode}: ${message}"
+}
+
+xvlan_over_bond()
+{
+	local param="$1"
+	local xvlan_type="$2"
+	local xvlan_mode="$3"
+	RET=0
+
+	# setup new bond mode
+	bond_reset "${param}"
+
+	ip -n ${s_ns} link add link bond0 name ${xvlan_type}0 type ${xvlan_type} mode ${xvlan_mode}
+	ip -n ${s_ns} link set ${xvlan_type}0 netns ${xvlan1_ns}
+	ip -n ${xvlan1_ns} link set dev ${xvlan_type}0 up
+	ip -n ${xvlan1_ns} addr add ${xvlan1_ip4}/24 dev ${xvlan_type}0
+	ip -n ${xvlan1_ns} addr add ${xvlan1_ip6}/24 dev ${xvlan_type}0
+
+	ip -n ${s_ns} link add link bond0 name ${xvlan_type}0 type ${xvlan_type} mode ${xvlan_mode}
+	ip -n ${s_ns} link set ${xvlan_type}0 netns ${xvlan2_ns}
+	ip -n ${xvlan2_ns} link set dev ${xvlan_type}0 up
+	ip -n ${xvlan2_ns} addr add ${xvlan2_ip4}/24 dev ${xvlan_type}0
+	ip -n ${xvlan2_ns} addr add ${xvlan2_ip6}/24 dev ${xvlan_type}0
+
+	sleep 2
+
+	check_connection "${c_ns}" "${s_ip4}" "IPv4: client->server"
+	check_connection "${c_ns}" "${s_ip6}" "IPv6: client->server"
+	check_connection "${c_ns}" "${xvlan1_ip4}" "IPv4: client->${xvlan_type}_1"
+	check_connection "${c_ns}" "${xvlan1_ip6}" "IPv6: client->${xvlan_type}_1"
+	check_connection "${c_ns}" "${xvlan2_ip4}" "IPv4: client->${xvlan_type}_2"
+	check_connection "${c_ns}" "${xvlan2_ip6}" "IPv6: client->${xvlan_type}_2"
+	check_connection "${xvlan1_ns}" "${xvlan2_ip4}" "IPv4: ${xvlan_type}_1->${xvlan_type}_2"
+	check_connection "${xvlan1_ns}" "${xvlan2_ip6}" "IPv6: ${xvlan_type}_1->${xvlan_type}_2"
+
+	check_connection "${s_ns}" "${c_ip4}" "IPv4: server->client"
+	check_connection "${s_ns}" "${c_ip6}" "IPv6: server->client"
+	check_connection "${xvlan1_ns}" "${c_ip4}" "IPv4: ${xvlan_type}_1->client"
+	check_connection "${xvlan1_ns}" "${c_ip6}" "IPv6: ${xvlan_type}_1->client"
+	check_connection "${xvlan2_ns}" "${c_ip4}" "IPv4: ${xvlan_type}_2->client"
+	check_connection "${xvlan2_ns}" "${c_ip6}" "IPv6: ${xvlan_type}_2->client"
+	check_connection "${xvlan2_ns}" "${xvlan1_ip4}" "IPv4: ${xvlan_type}_2->${xvlan_type}_1"
+	check_connection "${xvlan2_ns}" "${xvlan1_ip6}" "IPv6: ${xvlan_type}_2->${xvlan_type}_1"
+
+	ip -n ${c_ns} neigh flush dev eth0
+}
+
+trap cleanup EXIT
+
+setup_prepare
+ip netns add ${xvlan1_ns}
+ip netns add ${xvlan2_ns}
+
+bond_modes="active-backup balance-tlb balance-alb"
+
+for bond_mode in ${bond_modes}; do
+	xvlan_over_bond "mode ${bond_mode}" macvlan bridge
+	xvlan_over_bond "mode ${bond_mode}" ipvlan  l2
+done
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/drivers/net/bonding/config b/tools/testing/selftests/drivers/net/bonding/config
index 899d7fb6ea8e..dad4e5fda4db 100644
--- a/tools/testing/selftests/drivers/net/bonding/config
+++ b/tools/testing/selftests/drivers/net/bonding/config
@@ -3,6 +3,7 @@ CONFIG_BRIDGE=y
 CONFIG_DUMMY=y
 CONFIG_IPV6=y
 CONFIG_MACVLAN=y
+CONFIG_IPVLAN=y
 CONFIG_NET_ACT_GACT=y
 CONFIG_NET_CLS_FLOWER=y
 CONFIG_NET_SCH_INGRESS=y
diff --git a/tools/testing/selftests/drivers/net/lib/py/env.py b/tools/testing/selftests/drivers/net/lib/py/env.py
index fea343f209ea..987e452d3a45 100644
--- a/tools/testing/selftests/drivers/net/lib/py/env.py
+++ b/tools/testing/selftests/drivers/net/lib/py/env.py
@@ -48,6 +48,7 @@ class NetDrvEnv:
         else:
             self._ns = NetdevSimDev(**kwargs)
             self.dev = self._ns.nsims[0].dev
+        self.ifname = self.dev['ifname']
         self.ifindex = self.dev['ifindex']
 
     def __enter__(self):
diff --git a/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh b/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh
new file mode 100644
index 000000000000..3acaba41ac7b
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh
@@ -0,0 +1,225 @@
+#!/usr/bin/env bash
+# SPDX-License-Identifier: GPL-2.0
+
+# This file contains functions and helpers to support the netconsole
+# selftests
+#
+# Author: Breno Leitao <leitao@debian.org>
+
+set -euo pipefail
+
+LIBDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")")
+
+SRCIF="" # to be populated later
+SRCIP=192.0.2.1
+DSTIF="" # to be populated later
+DSTIP=192.0.2.2
+
+PORT="6666"
+MSG="netconsole selftest"
+USERDATA_KEY="key"
+USERDATA_VALUE="value"
+TARGET=$(mktemp -u netcons_XXXXX)
+DEFAULT_PRINTK_VALUES=$(cat /proc/sys/kernel/printk)
+NETCONS_CONFIGFS="/sys/kernel/config/netconsole"
+NETCONS_PATH="${NETCONS_CONFIGFS}"/"${TARGET}"
+# NAMESPACE will be populated by setup_ns with a random value
+NAMESPACE=""
+
+# IDs for netdevsim
+NSIM_DEV_1_ID=$((256 + RANDOM % 256))
+NSIM_DEV_2_ID=$((512 + RANDOM % 256))
+NSIM_DEV_SYS_NEW="/sys/bus/netdevsim/new_device"
+
+# Used to create and delete namespaces
+source "${LIBDIR}"/../../../../net/lib.sh
+source "${LIBDIR}"/../../../../net/net_helper.sh
+
+# Create netdevsim interfaces
+create_ifaces() {
+
+	echo "$NSIM_DEV_2_ID" > "$NSIM_DEV_SYS_NEW"
+	echo "$NSIM_DEV_1_ID" > "$NSIM_DEV_SYS_NEW"
+	udevadm settle 2> /dev/null || true
+
+	local NSIM1=/sys/bus/netdevsim/devices/netdevsim"$NSIM_DEV_1_ID"
+	local NSIM2=/sys/bus/netdevsim/devices/netdevsim"$NSIM_DEV_2_ID"
+
+	# These are global variables
+	SRCIF=$(find "$NSIM1"/net -maxdepth 1 -type d ! \
+		-path "$NSIM1"/net -exec basename {} \;)
+	DSTIF=$(find "$NSIM2"/net -maxdepth 1 -type d ! \
+		-path "$NSIM2"/net -exec basename {} \;)
+}
+
+link_ifaces() {
+	local NSIM_DEV_SYS_LINK="/sys/bus/netdevsim/link_device"
+	local SRCIF_IFIDX=$(cat /sys/class/net/"$SRCIF"/ifindex)
+	local DSTIF_IFIDX=$(cat /sys/class/net/"$DSTIF"/ifindex)
+
+	exec {NAMESPACE_FD}</var/run/netns/"${NAMESPACE}"
+	exec {INITNS_FD}</proc/self/ns/net
+
+	# Bind the dst interface to namespace
+	ip link set "${DSTIF}" netns "${NAMESPACE}"
+
+	# Linking one device to the other one (on the other namespace}
+	if ! echo "${INITNS_FD}:$SRCIF_IFIDX $NAMESPACE_FD:$DSTIF_IFIDX"  > $NSIM_DEV_SYS_LINK
+	then
+		echo "linking netdevsim1 with netdevsim2 should succeed"
+		cleanup
+		exit "${ksft_skip}"
+	fi
+}
+
+function configure_ip() {
+	# Configure the IPs for both interfaces
+	ip netns exec "${NAMESPACE}" ip addr add "${DSTIP}"/24 dev "${DSTIF}"
+	ip netns exec "${NAMESPACE}" ip link set "${DSTIF}" up
+
+	ip addr add "${SRCIP}"/24 dev "${SRCIF}"
+	ip link set "${SRCIF}" up
+}
+
+function set_network() {
+	# setup_ns function is coming from lib.sh
+	setup_ns NAMESPACE
+
+	# Create both interfaces, and assign the destination to a different
+	# namespace
+	create_ifaces
+
+	# Link both interfaces back to back
+	link_ifaces
+
+	configure_ip
+}
+
+function create_dynamic_target() {
+	DSTMAC=$(ip netns exec "${NAMESPACE}" \
+		 ip link show "${DSTIF}" | awk '/ether/ {print $2}')
+
+	# Create a dynamic target
+	mkdir "${NETCONS_PATH}"
+
+	echo "${DSTIP}" > "${NETCONS_PATH}"/remote_ip
+	echo "${SRCIP}" > "${NETCONS_PATH}"/local_ip
+	echo "${DSTMAC}" > "${NETCONS_PATH}"/remote_mac
+	echo "${SRCIF}" > "${NETCONS_PATH}"/dev_name
+
+	echo 1 > "${NETCONS_PATH}"/enabled
+}
+
+function cleanup() {
+	local NSIM_DEV_SYS_DEL="/sys/bus/netdevsim/del_device"
+
+	# delete netconsole dynamic reconfiguration
+	echo 0 > "${NETCONS_PATH}"/enabled
+	# Remove all the keys that got created during the selftest
+	find "${NETCONS_PATH}/userdata/" -mindepth 1 -type d -delete
+	# Remove the configfs entry
+	rmdir "${NETCONS_PATH}"
+
+	# Delete netdevsim devices
+	echo "$NSIM_DEV_2_ID" > "$NSIM_DEV_SYS_DEL"
+	echo "$NSIM_DEV_1_ID" > "$NSIM_DEV_SYS_DEL"
+
+	# this is coming from lib.sh
+	cleanup_all_ns
+
+	# Restoring printk configurations
+	echo "${DEFAULT_PRINTK_VALUES}" > /proc/sys/kernel/printk
+}
+
+function set_user_data() {
+	if [[ ! -d "${NETCONS_PATH}""/userdata" ]]
+	then
+		echo "Userdata path not available in ${NETCONS_PATH}/userdata"
+		exit "${ksft_skip}"
+	fi
+
+	KEY_PATH="${NETCONS_PATH}/userdata/${USERDATA_KEY}"
+	mkdir -p "${KEY_PATH}"
+	VALUE_PATH="${KEY_PATH}""/value"
+	echo "${USERDATA_VALUE}" > "${VALUE_PATH}"
+}
+
+function listen_port_and_save_to() {
+	local OUTPUT=${1}
+	# Just wait for 2 seconds
+	timeout 2 ip netns exec "${NAMESPACE}" \
+		socat UDP-LISTEN:"${PORT}",fork "${OUTPUT}"
+}
+
+function validate_result() {
+	local TMPFILENAME="$1"
+
+	# TMPFILENAME will contain something like:
+	# 6.11.1-0_fbk0_rc13_509_g30d75cea12f7,13,1822,115075213798,-;netconsole selftest: netcons_gtJHM
+	#  key=value
+
+	# Check if the file exists
+	if [ ! -f "$TMPFILENAME" ]; then
+		echo "FAIL: File was not generated." >&2
+		exit "${ksft_fail}"
+	fi
+
+	if ! grep -q "${MSG}" "${TMPFILENAME}"; then
+		echo "FAIL: ${MSG} not found in ${TMPFILENAME}" >&2
+		cat "${TMPFILENAME}" >&2
+		exit "${ksft_fail}"
+	fi
+
+	if ! grep -q "${USERDATA_KEY}=${USERDATA_VALUE}" "${TMPFILENAME}"; then
+		echo "FAIL: ${USERDATA_KEY}=${USERDATA_VALUE} not found in ${TMPFILENAME}" >&2
+		cat "${TMPFILENAME}" >&2
+		exit "${ksft_fail}"
+	fi
+
+	# Delete the file once it is validated, otherwise keep it
+	# for debugging purposes
+	rm "${TMPFILENAME}"
+	exit "${ksft_pass}"
+}
+
+function check_for_dependencies() {
+	if [ "$(id -u)" -ne 0 ]; then
+		echo "This test must be run as root" >&2
+		exit "${ksft_skip}"
+	fi
+
+	if ! which socat > /dev/null ; then
+		echo "SKIP: socat(1) is not available" >&2
+		exit "${ksft_skip}"
+	fi
+
+	if ! which ip > /dev/null ; then
+		echo "SKIP: ip(1) is not available" >&2
+		exit "${ksft_skip}"
+	fi
+
+	if ! which udevadm > /dev/null ; then
+		echo "SKIP: udevadm(1) is not available" >&2
+		exit "${ksft_skip}"
+	fi
+
+	if [ ! -f "${NSIM_DEV_SYS_NEW}" ]; then
+		echo "SKIP: file ${NSIM_DEV_SYS_NEW} does not exist. Check if CONFIG_NETDEVSIM is enabled" >&2
+		exit "${ksft_skip}"
+	fi
+
+	if [ ! -d "${NETCONS_CONFIGFS}" ]; then
+		echo "SKIP: directory ${NETCONS_CONFIGFS} does not exist. Check if NETCONSOLE_DYNAMIC is enabled" >&2
+		exit "${ksft_skip}"
+	fi
+
+	if ip link show "${DSTIF}" 2> /dev/null; then
+		echo "SKIP: interface ${DSTIF} exists in the system. Not overwriting it." >&2
+		exit "${ksft_skip}"
+	fi
+
+	if ip addr list | grep -E "inet.*(${SRCIP}|${DSTIP})" 2> /dev/null; then
+		echo "SKIP: IPs already in use. Skipping it" >&2
+		exit "${ksft_skip}"
+	fi
+}
diff --git a/tools/testing/selftests/drivers/net/netcons_basic.sh b/tools/testing/selftests/drivers/net/netcons_basic.sh
index b175f4d966e5..fe765da498e8 100755
--- a/tools/testing/selftests/drivers/net/netcons_basic.sh
+++ b/tools/testing/selftests/drivers/net/netcons_basic.sh
@@ -18,224 +18,8 @@ set -euo pipefail
 
 SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")")
 
-# Simple script to test dynamic targets in netconsole
-SRCIF="" # to be populated later
-SRCIP=192.0.2.1
-DSTIF="" # to be populated later
-DSTIP=192.0.2.2
+source "${SCRIPTDIR}"/lib/sh/lib_netcons.sh
 
-PORT="6666"
-MSG="netconsole selftest"
-USERDATA_KEY="key"
-USERDATA_VALUE="value"
-TARGET=$(mktemp -u netcons_XXXXX)
-DEFAULT_PRINTK_VALUES=$(cat /proc/sys/kernel/printk)
-NETCONS_CONFIGFS="/sys/kernel/config/netconsole"
-NETCONS_PATH="${NETCONS_CONFIGFS}"/"${TARGET}"
-KEY_PATH="${NETCONS_PATH}/userdata/${USERDATA_KEY}"
-# NAMESPACE will be populated by setup_ns with a random value
-NAMESPACE=""
-
-# IDs for netdevsim
-NSIM_DEV_1_ID=$((256 + RANDOM % 256))
-NSIM_DEV_2_ID=$((512 + RANDOM % 256))
-NSIM_DEV_SYS_NEW="/sys/bus/netdevsim/new_device"
-
-# Used to create and delete namespaces
-source "${SCRIPTDIR}"/../../net/lib.sh
-source "${SCRIPTDIR}"/../../net/net_helper.sh
-
-# Create netdevsim interfaces
-create_ifaces() {
-
-	echo "$NSIM_DEV_2_ID" > "$NSIM_DEV_SYS_NEW"
-	echo "$NSIM_DEV_1_ID" > "$NSIM_DEV_SYS_NEW"
-	udevadm settle 2> /dev/null || true
-
-	local NSIM1=/sys/bus/netdevsim/devices/netdevsim"$NSIM_DEV_1_ID"
-	local NSIM2=/sys/bus/netdevsim/devices/netdevsim"$NSIM_DEV_2_ID"
-
-	# These are global variables
-	SRCIF=$(find "$NSIM1"/net -maxdepth 1 -type d ! \
-		-path "$NSIM1"/net -exec basename {} \;)
-	DSTIF=$(find "$NSIM2"/net -maxdepth 1 -type d ! \
-		-path "$NSIM2"/net -exec basename {} \;)
-}
-
-link_ifaces() {
-	local NSIM_DEV_SYS_LINK="/sys/bus/netdevsim/link_device"
-	local SRCIF_IFIDX=$(cat /sys/class/net/"$SRCIF"/ifindex)
-	local DSTIF_IFIDX=$(cat /sys/class/net/"$DSTIF"/ifindex)
-
-	exec {NAMESPACE_FD}</var/run/netns/"${NAMESPACE}"
-	exec {INITNS_FD}</proc/self/ns/net
-
-	# Bind the dst interface to namespace
-	ip link set "${DSTIF}" netns "${NAMESPACE}"
-
-	# Linking one device to the other one (on the other namespace}
-	if ! echo "${INITNS_FD}:$SRCIF_IFIDX $NAMESPACE_FD:$DSTIF_IFIDX"  > $NSIM_DEV_SYS_LINK
-	then
-		echo "linking netdevsim1 with netdevsim2 should succeed"
-		cleanup
-		exit "${ksft_skip}"
-	fi
-}
-
-function configure_ip() {
-	# Configure the IPs for both interfaces
-	ip netns exec "${NAMESPACE}" ip addr add "${DSTIP}"/24 dev "${DSTIF}"
-	ip netns exec "${NAMESPACE}" ip link set "${DSTIF}" up
-
-	ip addr add "${SRCIP}"/24 dev "${SRCIF}"
-	ip link set "${SRCIF}" up
-}
-
-function set_network() {
-	# setup_ns function is coming from lib.sh
-	setup_ns NAMESPACE
-
-	# Create both interfaces, and assign the destination to a different
-	# namespace
-	create_ifaces
-
-	# Link both interfaces back to back
-	link_ifaces
-
-	configure_ip
-}
-
-function create_dynamic_target() {
-	DSTMAC=$(ip netns exec "${NAMESPACE}" \
-		 ip link show "${DSTIF}" | awk '/ether/ {print $2}')
-
-	# Create a dynamic target
-	mkdir "${NETCONS_PATH}"
-
-	echo "${DSTIP}" > "${NETCONS_PATH}"/remote_ip
-	echo "${SRCIP}" > "${NETCONS_PATH}"/local_ip
-	echo "${DSTMAC}" > "${NETCONS_PATH}"/remote_mac
-	echo "${SRCIF}" > "${NETCONS_PATH}"/dev_name
-
-	echo 1 > "${NETCONS_PATH}"/enabled
-}
-
-function cleanup() {
-	local NSIM_DEV_SYS_DEL="/sys/bus/netdevsim/del_device"
-
-	# delete netconsole dynamic reconfiguration
-	echo 0 > "${NETCONS_PATH}"/enabled
-	# Remove key
-	rmdir "${KEY_PATH}"
-	# Remove the configfs entry
-	rmdir "${NETCONS_PATH}"
-
-	# Delete netdevsim devices
-	echo "$NSIM_DEV_2_ID" > "$NSIM_DEV_SYS_DEL"
-	echo "$NSIM_DEV_1_ID" > "$NSIM_DEV_SYS_DEL"
-
-	# this is coming from lib.sh
-	cleanup_all_ns
-
-	# Restoring printk configurations
-	echo "${DEFAULT_PRINTK_VALUES}" > /proc/sys/kernel/printk
-}
-
-function set_user_data() {
-	if [[ ! -d "${NETCONS_PATH}""/userdata" ]]
-	then
-		echo "Userdata path not available in ${NETCONS_PATH}/userdata"
-		exit "${ksft_skip}"
-	fi
-
-	mkdir -p "${KEY_PATH}"
-	VALUE_PATH="${KEY_PATH}""/value"
-	echo "${USERDATA_VALUE}" > "${VALUE_PATH}"
-}
-
-function listen_port_and_save_to() {
-	local OUTPUT=${1}
-	# Just wait for 2 seconds
-	timeout 2 ip netns exec "${NAMESPACE}" \
-		socat UDP-LISTEN:"${PORT}",fork "${OUTPUT}"
-}
-
-function validate_result() {
-	local TMPFILENAME="$1"
-
-	# TMPFILENAME will contain something like:
-	# 6.11.1-0_fbk0_rc13_509_g30d75cea12f7,13,1822,115075213798,-;netconsole selftest: netcons_gtJHM
-	#  key=value
-
-	# Check if the file exists
-	if [ ! -f "$TMPFILENAME" ]; then
-		echo "FAIL: File was not generated." >&2
-		exit "${ksft_fail}"
-	fi
-
-	if ! grep -q "${MSG}" "${TMPFILENAME}"; then
-		echo "FAIL: ${MSG} not found in ${TMPFILENAME}" >&2
-		cat "${TMPFILENAME}" >&2
-		exit "${ksft_fail}"
-	fi
-
-	if ! grep -q "${USERDATA_KEY}=${USERDATA_VALUE}" "${TMPFILENAME}"; then
-		echo "FAIL: ${USERDATA_KEY}=${USERDATA_VALUE} not found in ${TMPFILENAME}" >&2
-		cat "${TMPFILENAME}" >&2
-		exit "${ksft_fail}"
-	fi
-
-	# Delete the file once it is validated, otherwise keep it
-	# for debugging purposes
-	rm "${TMPFILENAME}"
-	exit "${ksft_pass}"
-}
-
-function check_for_dependencies() {
-	if [ "$(id -u)" -ne 0 ]; then
-		echo "This test must be run as root" >&2
-		exit "${ksft_skip}"
-	fi
-
-	if ! which socat > /dev/null ; then
-		echo "SKIP: socat(1) is not available" >&2
-		exit "${ksft_skip}"
-	fi
-
-	if ! which ip > /dev/null ; then
-		echo "SKIP: ip(1) is not available" >&2
-		exit "${ksft_skip}"
-	fi
-
-	if ! which udevadm > /dev/null ; then
-		echo "SKIP: udevadm(1) is not available" >&2
-		exit "${ksft_skip}"
-	fi
-
-	if [ ! -f "${NSIM_DEV_SYS_NEW}" ]; then
-		echo "SKIP: file ${NSIM_DEV_SYS_NEW} does not exist. Check if CONFIG_NETDEVSIM is enabled" >&2
-		exit "${ksft_skip}"
-	fi
-
-	if [ ! -d "${NETCONS_CONFIGFS}" ]; then
-		echo "SKIP: directory ${NETCONS_CONFIGFS} does not exist. Check if NETCONSOLE_DYNAMIC is enabled" >&2
-		exit "${ksft_skip}"
-	fi
-
-	if ip link show "${DSTIF}" 2> /dev/null; then
-		echo "SKIP: interface ${DSTIF} exists in the system. Not overwriting it." >&2
-		exit "${ksft_skip}"
-	fi
-
-	if ip addr list | grep -E "inet.*(${SRCIP}|${DSTIP})" 2> /dev/null; then
-		echo "SKIP: IPs already in use. Skipping it" >&2
-		exit "${ksft_skip}"
-	fi
-}
-
-# ========== #
-# Start here #
-# ========== #
 modprobe netdevsim 2> /dev/null || true
 modprobe netconsole 2> /dev/null || true
 
diff --git a/tools/testing/selftests/drivers/net/netcons_overflow.sh b/tools/testing/selftests/drivers/net/netcons_overflow.sh
new file mode 100755
index 000000000000..29bad56448a2
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/netcons_overflow.sh
@@ -0,0 +1,67 @@
+#!/usr/bin/env bash
+# SPDX-License-Identifier: GPL-2.0
+
+# This test verifies that users can successfully create up to
+# MAX_USERDATA_ITEMS userdata entries without encountering any failures.
+#
+# Additionally, it tests for expected failure when attempting to exceed this
+# maximum limit.
+#
+# Author: Breno Leitao <leitao@debian.org>
+
+set -euo pipefail
+
+SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")")
+
+source "${SCRIPTDIR}"/lib/sh/lib_netcons.sh
+# This is coming from netconsole code. Check for it in drivers/net/netconsole.c
+MAX_USERDATA_ITEMS=16
+
+# Function to create userdata entries
+function create_userdata_max_entries() {
+	# All these keys should be created without any error
+	for i in $(seq $MAX_USERDATA_ITEMS)
+	do
+		# USERDATA_KEY is used by set_user_data
+		USERDATA_KEY="key"${i}
+		set_user_data
+	done
+}
+
+# Function to verify the entry limit
+function verify_entry_limit() {
+	# Allowing the test to fail without exiting, since the next command
+	# will fail
+	set +e
+	mkdir "${NETCONS_PATH}/userdata/key_that_will_fail" 2> /dev/null
+	ret="$?"
+	set -e
+	if [ "$ret" -eq 0 ];
+	then
+		echo "Adding more than ${MAX_USERDATA_ITEMS} entries in userdata should fail, but it didn't" >&2
+		ls "${NETCONS_PATH}/userdata/" >&2
+		exit "${ksft_fail}"
+	fi
+}
+
+# ========== #
+# Start here #
+# ========== #
+
+modprobe netdevsim 2> /dev/null || true
+modprobe netconsole 2> /dev/null || true
+
+# Check for basic system dependency and exit if not found
+check_for_dependencies
+
+# Remove the namespace, interfaces and netconsole target on exit
+trap cleanup EXIT
+# Create one namespace and two interfaces
+set_network
+# Create a dynamic target for netconsole
+create_dynamic_target
+# populate the maximum number of supported keys in userdata
+create_userdata_max_entries
+# Verify an additional entry is not allowed
+verify_entry_limit
+exit "${ksft_pass}"
diff --git a/tools/testing/selftests/drivers/net/stats.py b/tools/testing/selftests/drivers/net/stats.py
index 031ac9def6c0..efcc1e10575b 100755
--- a/tools/testing/selftests/drivers/net/stats.py
+++ b/tools/testing/selftests/drivers/net/stats.py
@@ -2,12 +2,15 @@
 # SPDX-License-Identifier: GPL-2.0
 
 import errno
+import subprocess
+import time
 from lib.py import ksft_run, ksft_exit, ksft_pr
-from lib.py import ksft_ge, ksft_eq, ksft_in, ksft_true, ksft_raises, KsftSkipEx, KsftXfailEx
+from lib.py import ksft_ge, ksft_eq, ksft_is, ksft_in, ksft_lt, ksft_true, ksft_raises
+from lib.py import KsftSkipEx, KsftXfailEx
 from lib.py import ksft_disruptive
 from lib.py import EthtoolFamily, NetdevFamily, RtnlFamily, NlError
 from lib.py import NetDrvEnv
-from lib.py import ip, defer
+from lib.py import cmd, ip, defer
 
 ethnl = EthtoolFamily()
 netfam = NetdevFamily()
@@ -174,10 +177,95 @@ def check_down(cfg) -> None:
     netfam.qstats_get({"ifindex": cfg.ifindex, "scope": "queue"}, dump=True)
 
 
+def __run_inf_loop(body):
+    body = body.strip()
+    if body[-1] != ';':
+        body += ';'
+
+    return subprocess.Popen(f"while true; do {body} done", shell=True,
+                            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+
+def __stats_increase_sanely(old, new) -> None:
+    for k in old.keys():
+        ksft_ge(new[k], old[k])
+        ksft_lt(new[k] - old[k], 1 << 31, comment="likely wrapping error")
+
+
+def procfs_hammer(cfg) -> None:
+    """
+    Reading stats via procfs only holds the RCU lock, which is not an exclusive
+    lock, make sure drivers can handle parallel reads of stats.
+    """
+    one = __run_inf_loop("cat /proc/net/dev")
+    defer(one.kill)
+    two = __run_inf_loop("cat /proc/net/dev")
+    defer(two.kill)
+
+    time.sleep(1)
+    # Make sure the processes are running
+    ksft_is(one.poll(), None)
+    ksft_is(two.poll(), None)
+
+    rtstat1 = rtnl.getlink({"ifi-index": cfg.ifindex})['stats64']
+    time.sleep(2)
+    rtstat2 = rtnl.getlink({"ifi-index": cfg.ifindex})['stats64']
+    __stats_increase_sanely(rtstat1, rtstat2)
+    # defers will kill the loops
+
+
+@ksft_disruptive
+def procfs_downup_hammer(cfg) -> None:
+    """
+    Reading stats via procfs only holds the RCU lock, drivers often try
+    to sleep when reading the stats, or don't protect against races.
+    """
+    # Max out the queues, we'll flip between max and 1
+    channels = ethnl.channels_get({'header': {'dev-index': cfg.ifindex}})
+    if channels['combined-count'] == 0:
+        rx_type = 'rx'
+    else:
+        rx_type = 'combined'
+    cur_queue_cnt = channels[f'{rx_type}-count']
+    max_queue_cnt = channels[f'{rx_type}-max']
+
+    cmd(f"ethtool -L {cfg.ifname} {rx_type} {max_queue_cnt}")
+    defer(cmd, f"ethtool -L {cfg.ifname} {rx_type} {cur_queue_cnt}")
+
+    # Real test stats
+    stats = __run_inf_loop("cat /proc/net/dev")
+    defer(stats.kill)
+
+    ipset = f"ip link set dev {cfg.ifname}"
+    defer(ip, f"link set dev {cfg.ifname} up")
+    # The "echo -n 1" lets us count iterations below
+    updown = f"{ipset} down; sleep 0.05; {ipset} up; sleep 0.05; " + \
+             f"ethtool -L {cfg.ifname} {rx_type} 1; " + \
+             f"ethtool -L {cfg.ifname} {rx_type} {max_queue_cnt}; " + \
+              "echo -n 1"
+    updown = __run_inf_loop(updown)
+    kill_updown = defer(updown.kill)
+
+    time.sleep(1)
+    # Make sure the processes are running
+    ksft_is(stats.poll(), None)
+    ksft_is(updown.poll(), None)
+
+    rtstat1 = rtnl.getlink({"ifi-index": cfg.ifindex})['stats64']
+    # We're looking for crashes, give it extra time
+    time.sleep(9)
+    rtstat2 = rtnl.getlink({"ifi-index": cfg.ifindex})['stats64']
+    __stats_increase_sanely(rtstat1, rtstat2)
+
+    kill_updown.exec()
+    stdout, _ = updown.communicate(timeout=5)
+    ksft_pr("completed up/down cycles:", len(stdout.decode('utf-8')))
+
+
 def main() -> None:
     with NetDrvEnv(__file__, queue_count=100) as cfg:
         ksft_run([check_pause, check_fec, pkt_byte_sum, qstat_by_ifindex,
-                  check_down],
+                  check_down, procfs_hammer, procfs_downup_hammer],
                  args=(cfg, ))
     ksft_exit()
 
diff --git a/tools/testing/selftests/kselftest/ktap_helpers.sh b/tools/testing/selftests/kselftest/ktap_helpers.sh
index 79a125eb24c2..05a461890671 100644
--- a/tools/testing/selftests/kselftest/ktap_helpers.sh
+++ b/tools/testing/selftests/kselftest/ktap_helpers.sh
@@ -7,6 +7,7 @@
 KTAP_TESTNO=1
 KTAP_CNT_PASS=0
 KTAP_CNT_FAIL=0
+KTAP_CNT_XFAIL=0
 KTAP_CNT_SKIP=0
 
 KSFT_PASS=0
@@ -69,6 +70,16 @@ ktap_test_skip() {
 	KTAP_CNT_SKIP=$((KTAP_CNT_SKIP+1))
 }
 
+ktap_test_xfail() {
+	description="$1"
+
+	result="ok"
+	directive="XFAIL"
+	__ktap_test "$result" "$description" "$directive"
+
+	KTAP_CNT_XFAIL=$((KTAP_CNT_XFAIL+1))
+}
+
 ktap_test_fail() {
 	description="$1"
 
@@ -99,7 +110,7 @@ ktap_exit_fail_msg() {
 ktap_finished() {
 	ktap_print_totals
 
-	if [ $((KTAP_CNT_PASS + KTAP_CNT_SKIP)) -eq "$KSFT_NUM_TESTS" ]; then
+	if [ $((KTAP_CNT_PASS + KTAP_CNT_SKIP + KTAP_CNT_XFAIL)) -eq "$KSFT_NUM_TESTS" ]; then
 		exit "$KSFT_PASS"
 	else
 		exit "$KSFT_FAIL"
@@ -107,5 +118,5 @@ ktap_finished() {
 }
 
 ktap_print_totals() {
-	echo "# Totals: pass:$KTAP_CNT_PASS fail:$KTAP_CNT_FAIL xfail:0 xpass:0 skip:$KTAP_CNT_SKIP error:0"
+	echo "# Totals: pass:$KTAP_CNT_PASS fail:$KTAP_CNT_FAIL xfail:$KTAP_CNT_XFAIL xpass:0 skip:$KTAP_CNT_SKIP error:0"
 }
diff --git a/tools/testing/selftests/memfd/memfd_test.c b/tools/testing/selftests/memfd/memfd_test.c
index 0a0b55516028..c0c53451a16d 100644
--- a/tools/testing/selftests/memfd/memfd_test.c
+++ b/tools/testing/selftests/memfd/memfd_test.c
@@ -282,6 +282,24 @@ static void *mfd_assert_mmap_shared(int fd)
 	return p;
 }
 
+static void *mfd_assert_mmap_read_shared(int fd)
+{
+	void *p;
+
+	p = mmap(NULL,
+		 mfd_def_size,
+		 PROT_READ,
+		 MAP_SHARED,
+		 fd,
+		 0);
+	if (p == MAP_FAILED) {
+		printf("mmap() failed: %m\n");
+		abort();
+	}
+
+	return p;
+}
+
 static void *mfd_assert_mmap_private(int fd)
 {
 	void *p;
@@ -980,6 +998,30 @@ static void test_seal_future_write(void)
 	close(fd);
 }
 
+static void test_seal_write_map_read_shared(void)
+{
+	int fd;
+	void *p;
+
+	printf("%s SEAL-WRITE-MAP-READ\n", memfd_str);
+
+	fd = mfd_assert_new("kern_memfd_seal_write_map_read",
+			    mfd_def_size,
+			    MFD_CLOEXEC | MFD_ALLOW_SEALING);
+
+	mfd_assert_add_seals(fd, F_SEAL_WRITE);
+	mfd_assert_has_seals(fd, F_SEAL_WRITE);
+
+	p = mfd_assert_mmap_read_shared(fd);
+
+	mfd_assert_read(fd);
+	mfd_assert_read_shared(fd);
+	mfd_fail_write(fd);
+
+	munmap(p, mfd_def_size);
+	close(fd);
+}
+
 /*
  * Test SEAL_SHRINK
  * Test whether SEAL_SHRINK actually prevents shrinking
@@ -1593,6 +1635,7 @@ int main(int argc, char **argv)
 
 	test_seal_write();
 	test_seal_future_write();
+	test_seal_write_map_read_shared();
 	test_seal_shrink();
 	test_seal_grow();
 	test_seal_resize();
diff --git a/tools/testing/selftests/net/lib/py/ksft.py b/tools/testing/selftests/net/lib/py/ksft.py
index 477ae76de93d..3efe005436cd 100644
--- a/tools/testing/selftests/net/lib/py/ksft.py
+++ b/tools/testing/selftests/net/lib/py/ksft.py
@@ -71,6 +71,11 @@ def ksft_in(a, b, comment=""):
         _fail("Check failed", a, "not in", b, comment)
 
 
+def ksft_is(a, b, comment=""):
+    if a is not b:
+        _fail("Check failed", a, "is not", b, comment)
+
+
 def ksft_ge(a, b, comment=""):
     if a < b:
         _fail("Check failed", a, "<", b, comment)
diff --git a/tools/testing/selftests/net/lib/py/ynl.py b/tools/testing/selftests/net/lib/py/ynl.py
index 076a7e8dc3eb..ad1e36baee2a 100644
--- a/tools/testing/selftests/net/lib/py/ynl.py
+++ b/tools/testing/selftests/net/lib/py/ynl.py
@@ -13,14 +13,14 @@ try:
         SPEC_PATH = KSFT_DIR / "net/lib/specs"
 
         sys.path.append(tools_full_path.as_posix())
-        from net.lib.ynl.lib import YnlFamily, NlError
+        from net.lib.ynl.pyynl.lib import YnlFamily, NlError
     else:
         # Running in tree
         tools_full_path = KSRC / "tools"
         SPEC_PATH = KSRC / "Documentation/netlink/specs"
 
         sys.path.append(tools_full_path.as_posix())
-        from net.ynl.lib import YnlFamily, NlError
+        from net.ynl.pyynl.lib import YnlFamily, NlError
 except ModuleNotFoundError as e:
     ksft_pr("Failed importing `ynl` library from kernel sources")
     ksft_pr(str(e))
diff --git a/tools/testing/selftests/net/nl_netdev.py b/tools/testing/selftests/net/nl_netdev.py
index 93d9d914529b..93e8cb671c3d 100755
--- a/tools/testing/selftests/net/nl_netdev.py
+++ b/tools/testing/selftests/net/nl_netdev.py
@@ -18,6 +18,23 @@ def lo_check(nf) -> None:
     ksft_eq(len(lo_info['xdp-rx-metadata-features']), 0)
 
 
+def napi_list_check(nf) -> None:
+    with NetdevSimDev(queue_count=100) as nsimdev:
+        nsim = nsimdev.nsims[0]
+
+        ip(f"link set dev {nsim.ifname} up")
+
+        napis = nf.napi_get({'ifindex': nsim.ifindex}, dump=True)
+        ksft_eq(len(napis), 100)
+
+        for q in [50, 0, 99]:
+            for i in range(4):
+                nsim.dfs_write("queue_reset", f"{q} {i}")
+                napis = nf.napi_get({'ifindex': nsim.ifindex}, dump=True)
+                ksft_eq(len(napis), 100,
+                        comment=f"queue count after reset queue {q} mode {i}")
+
+
 def page_pool_check(nf) -> None:
     with NetdevSimDev() as nsimdev:
         nsim = nsimdev.nsims[0]
@@ -89,7 +106,7 @@ def page_pool_check(nf) -> None:
 
 def main() -> None:
     nf = NetdevFamily()
-    ksft_run([empty_check, lo_check, page_pool_check],
+    ksft_run([empty_check, lo_check, page_pool_check, napi_list_check],
              args=(nf, ))
     ksft_exit()
 
diff --git a/tools/testing/selftests/net/packetdrill/ksft_runner.sh b/tools/testing/selftests/net/packetdrill/ksft_runner.sh
index 4071c133f29e..ff989c325eef 100755
--- a/tools/testing/selftests/net/packetdrill/ksft_runner.sh
+++ b/tools/testing/selftests/net/packetdrill/ksft_runner.sh
@@ -23,7 +23,7 @@ if [ $# -ne 1 ]; then
 	ktap_exit_fail_msg "usage: $0 <script>"
 	exit "$KSFT_FAIL"
 fi
-script="$1"
+script="$(basename $1)"
 
 if [ -z "$(which packetdrill)" ]; then
 	ktap_skip_all "packetdrill not found in PATH"
@@ -31,16 +31,29 @@ if [ -z "$(which packetdrill)" ]; then
 fi
 
 declare -a optargs
+failfunc=ktap_test_fail
+
 if [[ -n "${KSFT_MACHINE_SLOW}" ]]; then
 	optargs+=('--tolerance_usecs=14000')
+
+	# xfail tests that are known flaky with dbg config, not fixable.
+	# still run them for coverage (and expect 100% pass without dbg).
+	declare -ar xfail_list=(
+		"tcp_fast_recovery_prr-ss.*.pkt"
+		"tcp_timestamping.*.pkt"
+		"tcp_user_timeout_user-timeout-probe.pkt"
+		"tcp_zerocopy_epoll_.*.pkt"
+	)
+	readonly xfail_regex="^($(printf '%s|' "${xfail_list[@]}"))$"
+	[[ "$script" =~ ${xfail_regex} ]] && failfunc=ktap_test_xfail
 fi
 
 ktap_print_header
 ktap_set_plan 2
 
-unshare -n packetdrill ${ipv4_args[@]} ${optargs[@]} $(basename $script) > /dev/null \
-	&& ktap_test_pass "ipv4" || ktap_test_fail "ipv4"
-unshare -n packetdrill ${ipv6_args[@]} ${optargs[@]} $(basename $script) > /dev/null \
-	&& ktap_test_pass "ipv6" || ktap_test_fail "ipv6"
+unshare -n packetdrill ${ipv4_args[@]} ${optargs[@]} $script > /dev/null \
+	&& ktap_test_pass "ipv4" || $failfunc "ipv4"
+unshare -n packetdrill ${ipv6_args[@]} ${optargs[@]} $script > /dev/null \
+	&& ktap_test_pass "ipv6" || $failfunc "ipv6"
 
 ktap_finished
diff --git a/tools/testing/selftests/net/ynl.mk b/tools/testing/selftests/net/ynl.mk
index d43afe243779..12e7cae251be 100644
--- a/tools/testing/selftests/net/ynl.mk
+++ b/tools/testing/selftests/net/ynl.mk
@@ -31,7 +31,8 @@ $(OUTPUT)/libynl.a: $(YNL_SPECS) $(OUTPUT)/.libynl-$(YNL_GENS_HASH).sig
 	$(Q)cp $(top_srcdir)/tools/net/ynl/libynl.a $(OUTPUT)/libynl.a
 
 EXTRA_CLEAN += \
-	$(top_srcdir)/tools/net/ynl/lib/__pycache__ \
+	$(top_srcdir)/tools/net/ynl/pyynl/__pycache__ \
+	$(top_srcdir)/tools/net/ynl/pyynl/lib/__pycache__ \
 	$(top_srcdir)/tools/net/ynl/lib/*.[ado] \
 	$(OUTPUT)/.libynl-*.sig \
 	$(OUTPUT)/libynl.a
diff --git a/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.bpf.c b/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.bpf.c
index 37d9bf6fb745..6f4c3f5a1c5d 100644
--- a/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.bpf.c
+++ b/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.bpf.c
@@ -20,7 +20,7 @@ s32 BPF_STRUCT_OPS(ddsp_bogus_dsq_fail_select_cpu, struct task_struct *p,
 		 * If we dispatch to a bogus DSQ that will fall back to the
 		 * builtin global DSQ, we fail gracefully.
 		 */
-		scx_bpf_dispatch_vtime(p, 0xcafef00d, SCX_SLICE_DFL,
+		scx_bpf_dsq_insert_vtime(p, 0xcafef00d, SCX_SLICE_DFL,
 				       p->scx.dsq_vtime, 0);
 		return cpu;
 	}
diff --git a/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.bpf.c b/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.bpf.c
index dffc97d9cdf1..e4a55027778f 100644
--- a/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.bpf.c
+++ b/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.bpf.c
@@ -17,8 +17,8 @@ s32 BPF_STRUCT_OPS(ddsp_vtimelocal_fail_select_cpu, struct task_struct *p,
 
 	if (cpu >= 0) {
 		/* Shouldn't be allowed to vtime dispatch to a builtin DSQ. */
-		scx_bpf_dispatch_vtime(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL,
-				       p->scx.dsq_vtime, 0);
+		scx_bpf_dsq_insert_vtime(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL,
+					 p->scx.dsq_vtime, 0);
 		return cpu;
 	}
 
diff --git a/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c b/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c
index 6a7db1502c29..fbda6bf54671 100644
--- a/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c
+++ b/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c
@@ -43,9 +43,12 @@ void BPF_STRUCT_OPS(dsp_local_on_dispatch, s32 cpu, struct task_struct *prev)
 	if (!p)
 		return;
 
-	target = bpf_get_prandom_u32() % nr_cpus;
+	if (p->nr_cpus_allowed == nr_cpus)
+		target = bpf_get_prandom_u32() % nr_cpus;
+	else
+		target = scx_bpf_task_cpu(p);
 
-	scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | target, SCX_SLICE_DFL, 0);
+	scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | target, SCX_SLICE_DFL, 0);
 	bpf_task_release(p);
 }
 
diff --git a/tools/testing/selftests/sched_ext/dsp_local_on.c b/tools/testing/selftests/sched_ext/dsp_local_on.c
index 472851b56854..0ff27e57fe43 100644
--- a/tools/testing/selftests/sched_ext/dsp_local_on.c
+++ b/tools/testing/selftests/sched_ext/dsp_local_on.c
@@ -34,9 +34,10 @@ static enum scx_test_status run(void *ctx)
 	/* Just sleeping is fine, plenty of scheduling events happening */
 	sleep(1);
 
-	SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_ERROR));
 	bpf_link__destroy(link);
 
+	SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_UNREG));
+
 	return SCX_TEST_PASS;
 }
 
@@ -50,7 +51,7 @@ static void cleanup(void *ctx)
 struct scx_test dsp_local_on = {
 	.name = "dsp_local_on",
 	.description = "Verify we can directly dispatch tasks to a local DSQs "
-		       "from osp.dispatch()",
+		       "from ops.dispatch()",
 	.setup = setup,
 	.run = run,
 	.cleanup = cleanup,
diff --git a/tools/testing/selftests/sched_ext/enq_select_cpu_fails.bpf.c b/tools/testing/selftests/sched_ext/enq_select_cpu_fails.bpf.c
index 1efb50d61040..a7cf868d5e31 100644
--- a/tools/testing/selftests/sched_ext/enq_select_cpu_fails.bpf.c
+++ b/tools/testing/selftests/sched_ext/enq_select_cpu_fails.bpf.c
@@ -31,7 +31,7 @@ void BPF_STRUCT_OPS(enq_select_cpu_fails_enqueue, struct task_struct *p,
 	/* Can only call from ops.select_cpu() */
 	scx_bpf_select_cpu_dfl(p, 0, 0, &found);
 
-	scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
+	scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
 }
 
 SEC(".struct_ops.link")
diff --git a/tools/testing/selftests/sched_ext/exit.bpf.c b/tools/testing/selftests/sched_ext/exit.bpf.c
index d75d4faf07f6..4bc36182d3ff 100644
--- a/tools/testing/selftests/sched_ext/exit.bpf.c
+++ b/tools/testing/selftests/sched_ext/exit.bpf.c
@@ -33,7 +33,7 @@ void BPF_STRUCT_OPS(exit_enqueue, struct task_struct *p, u64 enq_flags)
 	if (exit_point == EXIT_ENQUEUE)
 		EXIT_CLEANLY();
 
-	scx_bpf_dispatch(p, DSQ_ID, SCX_SLICE_DFL, enq_flags);
+	scx_bpf_dsq_insert(p, DSQ_ID, SCX_SLICE_DFL, enq_flags);
 }
 
 void BPF_STRUCT_OPS(exit_dispatch, s32 cpu, struct task_struct *p)
@@ -41,7 +41,7 @@ void BPF_STRUCT_OPS(exit_dispatch, s32 cpu, struct task_struct *p)
 	if (exit_point == EXIT_DISPATCH)
 		EXIT_CLEANLY();
 
-	scx_bpf_consume(DSQ_ID);
+	scx_bpf_dsq_move_to_local(DSQ_ID);
 }
 
 void BPF_STRUCT_OPS(exit_enable, struct task_struct *p)
diff --git a/tools/testing/selftests/sched_ext/maximal.bpf.c b/tools/testing/selftests/sched_ext/maximal.bpf.c
index 4d4cd8d966db..430f5e13bf55 100644
--- a/tools/testing/selftests/sched_ext/maximal.bpf.c
+++ b/tools/testing/selftests/sched_ext/maximal.bpf.c
@@ -12,6 +12,8 @@
 
 char _license[] SEC("license") = "GPL";
 
+#define DSQ_ID 0
+
 s32 BPF_STRUCT_OPS(maximal_select_cpu, struct task_struct *p, s32 prev_cpu,
 		   u64 wake_flags)
 {
@@ -20,7 +22,7 @@ s32 BPF_STRUCT_OPS(maximal_select_cpu, struct task_struct *p, s32 prev_cpu,
 
 void BPF_STRUCT_OPS(maximal_enqueue, struct task_struct *p, u64 enq_flags)
 {
-	scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
+	scx_bpf_dsq_insert(p, DSQ_ID, SCX_SLICE_DFL, enq_flags);
 }
 
 void BPF_STRUCT_OPS(maximal_dequeue, struct task_struct *p, u64 deq_flags)
@@ -28,7 +30,7 @@ void BPF_STRUCT_OPS(maximal_dequeue, struct task_struct *p, u64 deq_flags)
 
 void BPF_STRUCT_OPS(maximal_dispatch, s32 cpu, struct task_struct *prev)
 {
-	scx_bpf_consume(SCX_DSQ_GLOBAL);
+	scx_bpf_dsq_move_to_local(DSQ_ID);
 }
 
 void BPF_STRUCT_OPS(maximal_runnable, struct task_struct *p, u64 enq_flags)
@@ -123,7 +125,7 @@ void BPF_STRUCT_OPS(maximal_cgroup_set_weight, struct cgroup *cgrp, u32 weight)
 
 s32 BPF_STRUCT_OPS_SLEEPABLE(maximal_init)
 {
-	return 0;
+	return scx_bpf_create_dsq(DSQ_ID, -1);
 }
 
 void BPF_STRUCT_OPS(maximal_exit, struct scx_exit_info *info)
diff --git a/tools/testing/selftests/sched_ext/select_cpu_dfl.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dfl.bpf.c
index f171ac470970..13d0f5be788d 100644
--- a/tools/testing/selftests/sched_ext/select_cpu_dfl.bpf.c
+++ b/tools/testing/selftests/sched_ext/select_cpu_dfl.bpf.c
@@ -30,7 +30,7 @@ void BPF_STRUCT_OPS(select_cpu_dfl_enqueue, struct task_struct *p,
 	}
 	scx_bpf_put_idle_cpumask(idle_mask);
 
-	scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
+	scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
 }
 
 SEC(".struct_ops.link")
diff --git a/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.bpf.c
index 9efdbb7da928..815f1d5d61ac 100644
--- a/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.bpf.c
+++ b/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.bpf.c
@@ -67,7 +67,7 @@ void BPF_STRUCT_OPS(select_cpu_dfl_nodispatch_enqueue, struct task_struct *p,
 		saw_local = true;
 	}
 
-	scx_bpf_dispatch(p, dsq_id, SCX_SLICE_DFL, enq_flags);
+	scx_bpf_dsq_insert(p, dsq_id, SCX_SLICE_DFL, enq_flags);
 }
 
 s32 BPF_STRUCT_OPS(select_cpu_dfl_nodispatch_init_task,
diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch.bpf.c
index 59bfc4f36167..4bb99699e920 100644
--- a/tools/testing/selftests/sched_ext/select_cpu_dispatch.bpf.c
+++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch.bpf.c
@@ -29,7 +29,7 @@ s32 BPF_STRUCT_OPS(select_cpu_dispatch_select_cpu, struct task_struct *p,
 	cpu = prev_cpu;
 
 dispatch:
-	scx_bpf_dispatch(p, dsq_id, SCX_SLICE_DFL, 0);
+	scx_bpf_dsq_insert(p, dsq_id, SCX_SLICE_DFL, 0);
 	return cpu;
 }
 
diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.bpf.c
index 3bbd5fcdfb18..2a75de11b2cf 100644
--- a/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.bpf.c
+++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.bpf.c
@@ -18,7 +18,7 @@ s32 BPF_STRUCT_OPS(select_cpu_dispatch_bad_dsq_select_cpu, struct task_struct *p
 		   s32 prev_cpu, u64 wake_flags)
 {
 	/* Dispatching to a random DSQ should fail. */
-	scx_bpf_dispatch(p, 0xcafef00d, SCX_SLICE_DFL, 0);
+	scx_bpf_dsq_insert(p, 0xcafef00d, SCX_SLICE_DFL, 0);
 
 	return prev_cpu;
 }
diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.bpf.c
index 0fda57fe0ecf..99d075695c97 100644
--- a/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.bpf.c
+++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.bpf.c
@@ -18,8 +18,8 @@ s32 BPF_STRUCT_OPS(select_cpu_dispatch_dbl_dsp_select_cpu, struct task_struct *p
 		   s32 prev_cpu, u64 wake_flags)
 {
 	/* Dispatching twice in a row is disallowed. */
-	scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0);
-	scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0);
+	scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0);
+	scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0);
 
 	return prev_cpu;
 }
diff --git a/tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c
index e6c67bcf5e6e..bfcb96cd4954 100644
--- a/tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c
+++ b/tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c
@@ -2,8 +2,8 @@
 /*
  * A scheduler that validates that enqueue flags are properly stored and
  * applied at dispatch time when a task is directly dispatched from
- * ops.select_cpu(). We validate this by using scx_bpf_dispatch_vtime(), and
- * making the test a very basic vtime scheduler.
+ * ops.select_cpu(). We validate this by using scx_bpf_dsq_insert_vtime(),
+ * and making the test a very basic vtime scheduler.
  *
  * Copyright (c) 2024 Meta Platforms, Inc. and affiliates.
  * Copyright (c) 2024 David Vernet <dvernet@meta.com>
@@ -47,13 +47,13 @@ s32 BPF_STRUCT_OPS(select_cpu_vtime_select_cpu, struct task_struct *p,
 	cpu = prev_cpu;
 	scx_bpf_test_and_clear_cpu_idle(cpu);
 ddsp:
-	scx_bpf_dispatch_vtime(p, VTIME_DSQ, SCX_SLICE_DFL, task_vtime(p), 0);
+	scx_bpf_dsq_insert_vtime(p, VTIME_DSQ, SCX_SLICE_DFL, task_vtime(p), 0);
 	return cpu;
 }
 
 void BPF_STRUCT_OPS(select_cpu_vtime_dispatch, s32 cpu, struct task_struct *p)
 {
-	if (scx_bpf_consume(VTIME_DSQ))
+	if (scx_bpf_dsq_move_to_local(VTIME_DSQ))
 		consumed = true;
 }
 
diff --git a/tools/testing/selftests/tc-testing/tc-tests/filters/flow.json b/tools/testing/selftests/tc-testing/tc-tests/filters/flow.json
index 996448afe31b..91d120548bf5 100644
--- a/tools/testing/selftests/tc-testing/tc-tests/filters/flow.json
+++ b/tools/testing/selftests/tc-testing/tc-tests/filters/flow.json
@@ -78,10 +78,10 @@
         "setup": [
             "$TC qdisc add dev $DEV1 ingress"
         ],
-        "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 prio 1 protocol ip flow map key dst rshift 0xff",
+        "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 prio 1 protocol ip flow map key dst rshift 0x1f",
         "expExitCode": "0",
         "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 protocol ip prio 1 flow",
-        "matchPattern": "filter parent ffff: protocol ip pref 1 flow chain [0-9]+ handle 0x1 map keys dst rshift 255 baseclass",
+        "matchPattern": "filter parent ffff: protocol ip pref 1 flow chain [0-9]+ handle 0x1 map keys dst rshift 31 baseclass",
         "matchCount": "1",
         "teardown": [
             "$TC qdisc del dev $DEV1 ingress"