104 files changed, 5374 insertions, 438 deletions
diff --git a/Documentation/networking/device_drivers/ethernet/marvell/octeontx2.rst b/Documentation/networking/device_drivers/ethernet/marvell/octeontx2.rst index 88f508338c5f..d3fcf536d14e 100644 --- a/Documentation/networking/device_drivers/ethernet/marvell/octeontx2.rst +++ b/Documentation/networking/device_drivers/ethernet/marvell/octeontx2.rst @@ -12,6 +12,7 @@ Contents - `Overview`_ - `Drivers`_ - `Basic packet flow`_ +- `Devlink health reporters`_ Overview ======== @@ -157,3 +158,52 @@ Egress 3. The SQ descriptor ring is maintained in buffers allocated from SQ mapped pool of NPA block LF. 4. NIX block transmits the pkt on the designated channel. 5. NPC MCAM entries can be installed to divert pkt onto a different channel. + +Devlink health reporters +======================== + +NPA Reporters +------------- +The NPA reporters are responsible for reporting and recovering the following group of errors +1. GENERAL events + - Error due to operation of unmapped PF. + - Error due to disabled alloc/free for other HW blocks (NIX, SSO, TIM, DPI and AURA). +2. ERROR events + - Fault due to NPA_AQ_INST_S read or NPA_AQ_RES_S write. + - AQ Doorbell Error. +3. RAS events + - RAS Error Reporting for NPA_AQ_INST_S/NPA_AQ_RES_S. +4. RVU events + - Error due to unmapped slot. + +Sample Output +------------- +~# devlink health +pci/0002:01:00.0: + reporter hw_npa_intr + state healthy error 2872 recover 2872 last_dump_date 2020-12-10 last_dump_time 09:39:09 grace_period 0 auto_recover true auto_dump true + reporter hw_npa_gen + state healthy error 2872 recover 2872 last_dump_date 2020-12-11 last_dump_time 04:43:04 grace_period 0 auto_recover true auto_dump true + reporter hw_npa_err + state healthy error 2871 recover 2871 last_dump_date 2020-12-10 last_dump_time 09:39:17 grace_period 0 auto_recover true auto_dump true + reporter hw_npa_ras + state healthy error 0 recover 0 last_dump_date 2020-12-10 last_dump_time 09:32:40 grace_period 0 auto_recover true auto_dump true + +Each reporter dumps the + - Error Type + - Error Register value + - Reason in words + +For eg: +~# devlink health dump show pci/0002:01:00.0 reporter hw_npa_gen + NPA_AF_GENERAL: + NPA General Interrupt Reg : 1 + NIX0: free disabled RX +~# devlink health dump show pci/0002:01:00.0 reporter hw_npa_intr + NPA_AF_RVU: + NPA RVU Interrupt Reg : 1 + Unmap Slot Error +~# devlink health dump show pci/0002:01:00.0 reporter hw_npa_err + NPA_AF_ERR: + NPA Error Interrupt Reg : 4096 + AQ Doorbell Error diff --git a/MAINTAINERS b/MAINTAINERS index 5c1a6ba5ef26..85784a4c41df 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10517,6 +10517,14 @@ L: netdev@vger.kernel.org S: Maintained F: drivers/net/ethernet/marvell/mvneta.* +MARVELL MVPP2 ETHERNET DRIVER +M: Marcin Wojtas <mw@semihalf.com> +M: Russell King <linux@armlinux.org.uk> +L: netdev@vger.kernel.org +S: Maintained +F: Documentation/devicetree/bindings/net/marvell-pp2.txt +F: drivers/net/ethernet/marvell/mvpp2/ + MARVELL MWIFIEX WIRELESS DRIVER M: Amitkumar Karwar <amitkarwar@gmail.com> M: Ganapathi Bhat <ganapathi.bhat@nxp.com> diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index e7f68ac0c7e3..eafe6bedc692 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -1347,9 +1347,16 @@ static int mv88e6xxx_atu_setup(struct mv88e6xxx_chip *chip) if (err) return err; - err = mv88e6xxx_g1_atu_set_learn2all(chip, true); - if (err) - return err; + /* The chips that have a "learn2all" bit in Global1, ATU + * Control are precisely those 
whose port registers have a + * Message Port bit in Port Control 1 and hence implement + * ->port_setup_message_port. + */ + if (chip->info->ops->port_setup_message_port) { + err = mv88e6xxx_g1_atu_set_learn2all(chip, true); + if (err) + return err; + } return mv88e6xxx_g1_atu_set_age_time(chip, 300000); } diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c index 7b444fcb6289..9ff79d5d14c4 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c @@ -2100,19 +2100,16 @@ static int bnxt_find_nvram_item(struct net_device *dev, u16 type, u16 ordinal, u16 ext, u16 *index, u32 *item_length, u32 *data_length); -static int bnxt_flash_nvram(struct net_device *dev, - u16 dir_type, - u16 dir_ordinal, - u16 dir_ext, - u16 dir_attr, - const u8 *data, - size_t data_len) +static int __bnxt_flash_nvram(struct net_device *dev, u16 dir_type, + u16 dir_ordinal, u16 dir_ext, u16 dir_attr, + u32 dir_item_len, const u8 *data, + size_t data_len) { struct bnxt *bp = netdev_priv(dev); int rc; struct hwrm_nvm_write_input req = {0}; dma_addr_t dma_handle; - u8 *kmem; + u8 *kmem = NULL; bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_NVM_WRITE, -1, -1); @@ -2120,26 +2117,42 @@ static int bnxt_flash_nvram(struct net_device *dev, req.dir_ordinal = cpu_to_le16(dir_ordinal); req.dir_ext = cpu_to_le16(dir_ext); req.dir_attr = cpu_to_le16(dir_attr); - req.dir_data_length = cpu_to_le32(data_len); + req.dir_item_length = cpu_to_le32(dir_item_len); + if (data_len && data) { + req.dir_data_length = cpu_to_le32(data_len); - kmem = dma_alloc_coherent(&bp->pdev->dev, data_len, &dma_handle, - GFP_KERNEL); - if (!kmem) { - netdev_err(dev, "dma_alloc_coherent failure, length = %u\n", - (unsigned)data_len); - return -ENOMEM; + kmem = dma_alloc_coherent(&bp->pdev->dev, data_len, &dma_handle, + GFP_KERNEL); + if (!kmem) + return -ENOMEM; + + memcpy(kmem, data, data_len); + req.host_src_addr = cpu_to_le64(dma_handle); } - memcpy(kmem, data, data_len); - req.host_src_addr = cpu_to_le64(dma_handle); - rc = hwrm_send_message(bp, &req, sizeof(req), FLASH_NVRAM_TIMEOUT); - dma_free_coherent(&bp->pdev->dev, data_len, kmem, dma_handle); + rc = _hwrm_send_message(bp, &req, sizeof(req), FLASH_NVRAM_TIMEOUT); + if (kmem) + dma_free_coherent(&bp->pdev->dev, data_len, kmem, dma_handle); if (rc == -EACCES) bnxt_print_admin_err(bp); return rc; } +static int bnxt_flash_nvram(struct net_device *dev, u16 dir_type, + u16 dir_ordinal, u16 dir_ext, u16 dir_attr, + const u8 *data, size_t data_len) +{ + struct bnxt *bp = netdev_priv(dev); + int rc; + + mutex_lock(&bp->hwrm_cmd_lock); + rc = __bnxt_flash_nvram(dev, dir_type, dir_ordinal, dir_ext, dir_attr, + 0, data, data_len); + mutex_unlock(&bp->hwrm_cmd_lock); + return rc; +} + static int bnxt_hwrm_firmware_reset(struct net_device *dev, u8 proc_type, u8 self_reset, u8 flags) { @@ -2419,90 +2432,141 @@ static int bnxt_flash_firmware_from_file(struct net_device *dev, return rc; } +#define BNXT_PKG_DMA_SIZE 0x40000 +#define BNXT_NVM_MORE_FLAG (cpu_to_le16(NVM_MODIFY_REQ_FLAGS_BATCH_MODE)) +#define BNXT_NVM_LAST_FLAG (cpu_to_le16(NVM_MODIFY_REQ_FLAGS_BATCH_LAST)) + int bnxt_flash_package_from_fw_obj(struct net_device *dev, const struct firmware *fw, u32 install_type) { - struct bnxt *bp = netdev_priv(dev); - struct hwrm_nvm_install_update_output *resp = bp->hwrm_cmd_resp_addr; struct hwrm_nvm_install_update_input install = {0}; + struct hwrm_nvm_install_update_output resp = {0}; + struct 
hwrm_nvm_modify_input modify = {0}; + struct bnxt *bp = netdev_priv(dev); + bool defrag_attempted = false; + dma_addr_t dma_handle; + u8 *kmem = NULL; + u32 modify_len; u32 item_len; int rc = 0; u16 index; bnxt_hwrm_fw_set_time(bp); - rc = bnxt_find_nvram_item(dev, BNX_DIR_TYPE_UPDATE, - BNX_DIR_ORDINAL_FIRST, BNX_DIR_EXT_NONE, - &index, &item_len, NULL); - if (rc) { - netdev_err(dev, "PKG update area not created in nvram\n"); - return rc; + bnxt_hwrm_cmd_hdr_init(bp, &modify, HWRM_NVM_MODIFY, -1, -1); + + /* Try allocating a large DMA buffer first. Older fw will + * cause excessive NVRAM erases when using small blocks. + */ + modify_len = roundup_pow_of_two(fw->size); + modify_len = min_t(u32, modify_len, BNXT_PKG_DMA_SIZE); + while (1) { + kmem = dma_alloc_coherent(&bp->pdev->dev, modify_len, + &dma_handle, GFP_KERNEL); + if (!kmem && modify_len > PAGE_SIZE) + modify_len /= 2; + else + break; } + if (!kmem) + return -ENOMEM; - if (fw->size > item_len) { - netdev_err(dev, "PKG insufficient update area in nvram: %lu\n", - (unsigned long)fw->size); - rc = -EFBIG; - } else { - dma_addr_t dma_handle; - u8 *kmem; - struct hwrm_nvm_modify_input modify = {0}; + modify.host_src_addr = cpu_to_le64(dma_handle); - bnxt_hwrm_cmd_hdr_init(bp, &modify, HWRM_NVM_MODIFY, -1, -1); + bnxt_hwrm_cmd_hdr_init(bp, &install, HWRM_NVM_INSTALL_UPDATE, -1, -1); + if ((install_type & 0xffff) == 0) + install_type >>= 16; + install.install_type = cpu_to_le32(install_type); + + do { + u32 copied = 0, len = modify_len; + + rc = bnxt_find_nvram_item(dev, BNX_DIR_TYPE_UPDATE, + BNX_DIR_ORDINAL_FIRST, + BNX_DIR_EXT_NONE, + &index, &item_len, NULL); + if (rc) { + netdev_err(dev, "PKG update area not created in nvram\n"); + break; + } + if (fw->size > item_len) { + netdev_err(dev, "PKG insufficient update area in nvram: %lu\n", + (unsigned long)fw->size); + rc = -EFBIG; + break; + } modify.dir_idx = cpu_to_le16(index); - modify.len = cpu_to_le32(fw->size); - kmem = dma_alloc_coherent(&bp->pdev->dev, fw->size, - &dma_handle, GFP_KERNEL); - if (!kmem) { - netdev_err(dev, - "dma_alloc_coherent failure, length = %u\n", - (unsigned int)fw->size); - rc = -ENOMEM; - } else { - memcpy(kmem, fw->data, fw->size); - modify.host_src_addr = cpu_to_le64(dma_handle); + if (fw->size > modify_len) + modify.flags = BNXT_NVM_MORE_FLAG; + while (copied < fw->size) { + u32 balance = fw->size - copied; + if (balance <= modify_len) { + len = balance; + if (copied) + modify.flags |= BNXT_NVM_LAST_FLAG; + } + memcpy(kmem, fw->data + copied, len); + modify.len = cpu_to_le32(len); + modify.offset = cpu_to_le32(copied); rc = hwrm_send_message(bp, &modify, sizeof(modify), FLASH_PACKAGE_TIMEOUT); - dma_free_coherent(&bp->pdev->dev, fw->size, kmem, - dma_handle); + if (rc) + goto pkg_abort; + copied += len; } - } - if (rc) - goto err_exit; - - if ((install_type & 0xffff) == 0) - install_type >>= 16; - bnxt_hwrm_cmd_hdr_init(bp, &install, HWRM_NVM_INSTALL_UPDATE, -1, -1); - install.install_type = cpu_to_le32(install_type); + mutex_lock(&bp->hwrm_cmd_lock); + rc = _hwrm_send_message_silent(bp, &install, sizeof(install), + INSTALL_PACKAGE_TIMEOUT); + memcpy(&resp, bp->hwrm_cmd_resp_addr, sizeof(resp)); - mutex_lock(&bp->hwrm_cmd_lock); - rc = _hwrm_send_message(bp, &install, sizeof(install), - INSTALL_PACKAGE_TIMEOUT); - if (rc) { - u8 error_code = ((struct hwrm_err_output *)resp)->cmd_err; + if (defrag_attempted) { + /* We have tried to defragment already in the previous + * iteration. 
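+			 * Only one such retry is attempted.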
Return with the result for INSTALL_UPDATE + */ + mutex_unlock(&bp->hwrm_cmd_lock); + break; + } - if (resp->error_code && error_code == + if (rc && ((struct hwrm_err_output *)&resp)->cmd_err == NVM_INSTALL_UPDATE_CMD_ERR_CODE_FRAG_ERR) { - install.flags |= cpu_to_le16( - NVM_INSTALL_UPDATE_REQ_FLAGS_ALLOWED_TO_DEFRAG); - rc = _hwrm_send_message(bp, &install, sizeof(install), - INSTALL_PACKAGE_TIMEOUT); + install.flags |= + cpu_to_le16(NVM_INSTALL_UPDATE_REQ_FLAGS_ALLOWED_TO_DEFRAG); + + rc = _hwrm_send_message_silent(bp, &install, + sizeof(install), + INSTALL_PACKAGE_TIMEOUT); + memcpy(&resp, bp->hwrm_cmd_resp_addr, sizeof(resp)); + + if (rc && ((struct hwrm_err_output *)&resp)->cmd_err == + NVM_INSTALL_UPDATE_CMD_ERR_CODE_NO_SPACE) { + /* FW has cleared NVM area, driver will create + * UPDATE directory and try the flash again + */ + defrag_attempted = true; + rc = __bnxt_flash_nvram(bp->dev, + BNX_DIR_TYPE_UPDATE, + BNX_DIR_ORDINAL_FIRST, + 0, 0, item_len, NULL, + 0); + } else if (rc) { + netdev_err(dev, "HWRM_NVM_INSTALL_UPDATE failure rc :%x\n", rc); + } + } else if (rc) { + netdev_err(dev, "HWRM_NVM_INSTALL_UPDATE failure rc :%x\n", rc); } - if (rc) - goto flash_pkg_exit; - } + mutex_unlock(&bp->hwrm_cmd_lock); + } while (defrag_attempted && !rc); - if (resp->result) { +pkg_abort: + dma_free_coherent(&bp->pdev->dev, modify_len, kmem, dma_handle); + if (resp.result) { netdev_err(dev, "PKG install error = %d, problem_item = %d\n", - (s8)resp->result, (int)resp->problem_item); + (s8)resp.result, (int)resp.problem_item); rc = -ENOPKG; } -flash_pkg_exit: - mutex_unlock(&bp->hwrm_cmd_lock); -err_exit: if (rc == -EACCES) bnxt_print_admin_err(bp); return rc; diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c index 797886524054..39757b4cf8f4 100644 --- a/drivers/net/ethernet/intel/ice/ice_xsk.c +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c @@ -570,12 +570,6 @@ int ice_clean_rx_irq_zc(struct ice_ring *rx_ring, int budget) u16 vlan_tag = 0; u8 rx_ptype; - if (cleaned_count >= ICE_RX_BUF_WRITE) { - failure |= ice_alloc_rx_bufs_zc(rx_ring, - cleaned_count); - cleaned_count = 0; - } - rx_desc = ICE_RX_DESC(rx_ring, rx_ring->next_to_clean); stat_err_bits = BIT(ICE_RX_FLEX_DESC_STATUS0_DD_S); @@ -642,6 +636,9 @@ int ice_clean_rx_irq_zc(struct ice_ring *rx_ring, int budget) ice_receive_skb(rx_ring, skb, vlan_tag); } + if (cleaned_count >= ICE_RX_BUF_WRITE) + failure = !ice_alloc_rx_bufs_zc(rx_ring, cleaned_count); + ice_finalize_xdp_rx(rx_ring, xdp_xmit); ice_update_rx_ring_stats(rx_ring, total_rx_packets, total_rx_bytes); diff --git a/drivers/net/ethernet/marvell/octeontx2/Kconfig b/drivers/net/ethernet/marvell/octeontx2/Kconfig index 543a1d047567..16caa02095fe 100644 --- a/drivers/net/ethernet/marvell/octeontx2/Kconfig +++ b/drivers/net/ethernet/marvell/octeontx2/Kconfig @@ -9,6 +9,7 @@ config OCTEONTX2_MBOX config OCTEONTX2_AF tristate "Marvell OcteonTX2 RVU Admin Function driver" select OCTEONTX2_MBOX + select NET_DEVLINK depends on (64BIT && COMPILE_TEST) || ARM64 depends on PCI help diff --git a/drivers/net/ethernet/marvell/octeontx2/af/Makefile b/drivers/net/ethernet/marvell/octeontx2/af/Makefile index 7100d1dd856e..eb535c98ca38 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/Makefile +++ b/drivers/net/ethernet/marvell/octeontx2/af/Makefile @@ -10,4 +10,4 @@ obj-$(CONFIG_OCTEONTX2_AF) += octeontx2_af.o octeontx2_mbox-y := mbox.o rvu_trace.o octeontx2_af-y := cgx.o rvu.o rvu_cgx.o rvu_npa.o rvu_nix.o \ rvu_reg.o rvu_npc.o rvu_debugfs.o ptp.o 
rvu_npc_fs.o \ - rvu_cpt.o + rvu_cpt.o rvu_devlink.o diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c index 9f901c0edcbb..e8fd712860a1 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c @@ -2826,17 +2826,23 @@ static int rvu_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (err) goto err_flr; + err = rvu_register_dl(rvu); + if (err) + goto err_irq; + rvu_setup_rvum_blk_revid(rvu); /* Enable AF's VFs (if any) */ err = rvu_enable_sriov(rvu); if (err) - goto err_irq; + goto err_dl; /* Initialize debugfs */ rvu_dbg_init(rvu); return 0; +err_dl: + rvu_unregister_dl(rvu); err_irq: rvu_unregister_interrupts(rvu); err_flr: @@ -2868,6 +2874,7 @@ static void rvu_remove(struct pci_dev *pdev) rvu_dbg_exit(rvu); rvu_unregister_interrupts(rvu); + rvu_unregister_dl(rvu); rvu_flr_wq_destroy(rvu); rvu_cgx_exit(rvu); rvu_fwdata_exit(rvu); diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h index b6c0977499ab..b1a6ecfd563e 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h @@ -12,7 +12,10 @@ #define RVU_H #include <linux/pci.h> +#include <net/devlink.h> + #include "rvu_struct.h" +#include "rvu_devlink.h" #include "common.h" #include "mbox.h" #include "npc.h" @@ -422,6 +425,7 @@ struct rvu { #ifdef CONFIG_DEBUG_FS struct rvu_debugfs rvu_dbg; #endif + struct rvu_devlink *rvu_dl; }; static inline void rvu_write64(struct rvu *rvu, u64 block, u64 offset, u64 val) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_devlink.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_devlink.c new file mode 100644 index 000000000000..3f9d0ab6d5ae --- /dev/null +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_devlink.c @@ -0,0 +1,770 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Marvell OcteonTx2 RVU Devlink + * + * Copyright (C) 2020 Marvell. 
+ * + */ + +#include<linux/bitfield.h> + +#include "rvu.h" +#include "rvu_reg.h" +#include "rvu_struct.h" + +#define DRV_NAME "octeontx2-af" + +static int rvu_report_pair_start(struct devlink_fmsg *fmsg, const char *name) +{ + int err; + + err = devlink_fmsg_pair_nest_start(fmsg, name); + if (err) + return err; + + return devlink_fmsg_obj_nest_start(fmsg); +} + +static int rvu_report_pair_end(struct devlink_fmsg *fmsg) +{ + int err; + + err = devlink_fmsg_obj_nest_end(fmsg); + if (err) + return err; + + return devlink_fmsg_pair_nest_end(fmsg); +} + +static bool rvu_common_request_irq(struct rvu *rvu, int offset, + const char *name, irq_handler_t fn) +{ + struct rvu_devlink *rvu_dl = rvu->rvu_dl; + int rc; + + sprintf(&rvu->irq_name[offset * NAME_SIZE], name); + rc = request_irq(pci_irq_vector(rvu->pdev, offset), fn, 0, + &rvu->irq_name[offset * NAME_SIZE], rvu_dl); + if (rc) + dev_warn(rvu->dev, "Failed to register %s irq\n", name); + else + rvu->irq_allocated[offset] = true; + + return rvu->irq_allocated[offset]; +} + +static void rvu_npa_intr_work(struct work_struct *work) +{ + struct rvu_npa_health_reporters *rvu_npa_health_reporter; + + rvu_npa_health_reporter = container_of(work, struct rvu_npa_health_reporters, intr_work); + devlink_health_report(rvu_npa_health_reporter->rvu_hw_npa_intr_reporter, + "NPA_AF_RVU Error", + rvu_npa_health_reporter->npa_event_ctx); +} + +static irqreturn_t rvu_npa_af_rvu_intr_handler(int irq, void *rvu_irq) +{ + struct rvu_npa_event_ctx *npa_event_context; + struct rvu_devlink *rvu_dl = rvu_irq; + struct rvu *rvu; + int blkaddr; + u64 intr; + + rvu = rvu_dl->rvu; + blkaddr = rvu_get_blkaddr(rvu, BLKTYPE_NPA, 0); + if (blkaddr < 0) + return IRQ_NONE; + + npa_event_context = rvu_dl->rvu_npa_health_reporter->npa_event_ctx; + intr = rvu_read64(rvu, blkaddr, NPA_AF_RVU_INT); + npa_event_context->npa_af_rvu_int = intr; + + /* Clear interrupts */ + rvu_write64(rvu, blkaddr, NPA_AF_RVU_INT, intr); + rvu_write64(rvu, blkaddr, NPA_AF_RVU_INT_ENA_W1C, ~0ULL); + queue_work(rvu_dl->devlink_wq, &rvu_dl->rvu_npa_health_reporter->intr_work); + + return IRQ_HANDLED; +} + +static void rvu_npa_gen_work(struct work_struct *work) +{ + struct rvu_npa_health_reporters *rvu_npa_health_reporter; + + rvu_npa_health_reporter = container_of(work, struct rvu_npa_health_reporters, gen_work); + devlink_health_report(rvu_npa_health_reporter->rvu_hw_npa_gen_reporter, + "NPA_AF_GEN Error", + rvu_npa_health_reporter->npa_event_ctx); +} + +static irqreturn_t rvu_npa_af_gen_intr_handler(int irq, void *rvu_irq) +{ + struct rvu_npa_event_ctx *npa_event_context; + struct rvu_devlink *rvu_dl = rvu_irq; + struct rvu *rvu; + int blkaddr; + u64 intr; + + rvu = rvu_dl->rvu; + blkaddr = rvu_get_blkaddr(rvu, BLKTYPE_NPA, 0); + if (blkaddr < 0) + return IRQ_NONE; + + npa_event_context = rvu_dl->rvu_npa_health_reporter->npa_event_ctx; + intr = rvu_read64(rvu, blkaddr, NPA_AF_GEN_INT); + npa_event_context->npa_af_rvu_gen = intr; + + /* Clear interrupts */ + rvu_write64(rvu, blkaddr, NPA_AF_GEN_INT, intr); + rvu_write64(rvu, blkaddr, NPA_AF_GEN_INT_ENA_W1C, ~0ULL); + queue_work(rvu_dl->devlink_wq, &rvu_dl->rvu_npa_health_reporter->gen_work); + + return IRQ_HANDLED; +} + +static void rvu_npa_err_work(struct work_struct *work) +{ + struct rvu_npa_health_reporters *rvu_npa_health_reporter; + + rvu_npa_health_reporter = container_of(work, struct rvu_npa_health_reporters, err_work); + devlink_health_report(rvu_npa_health_reporter->rvu_hw_npa_err_reporter, + "NPA_AF_ERR Error", + 
rvu_npa_health_reporter->npa_event_ctx); +} + +static irqreturn_t rvu_npa_af_err_intr_handler(int irq, void *rvu_irq) +{ + struct rvu_npa_event_ctx *npa_event_context; + struct rvu_devlink *rvu_dl = rvu_irq; + struct rvu *rvu; + int blkaddr; + u64 intr; + + rvu = rvu_dl->rvu; + blkaddr = rvu_get_blkaddr(rvu, BLKTYPE_NPA, 0); + if (blkaddr < 0) + return IRQ_NONE; + npa_event_context = rvu_dl->rvu_npa_health_reporter->npa_event_ctx; + intr = rvu_read64(rvu, blkaddr, NPA_AF_ERR_INT); + npa_event_context->npa_af_rvu_err = intr; + + /* Clear interrupts */ + rvu_write64(rvu, blkaddr, NPA_AF_ERR_INT, intr); + rvu_write64(rvu, blkaddr, NPA_AF_ERR_INT_ENA_W1C, ~0ULL); + queue_work(rvu_dl->devlink_wq, &rvu_dl->rvu_npa_health_reporter->err_work); + + return IRQ_HANDLED; +} + +static void rvu_npa_ras_work(struct work_struct *work) +{ + struct rvu_npa_health_reporters *rvu_npa_health_reporter; + + rvu_npa_health_reporter = container_of(work, struct rvu_npa_health_reporters, ras_work); + devlink_health_report(rvu_npa_health_reporter->rvu_hw_npa_ras_reporter, + "HW NPA_AF_RAS Error reported", + rvu_npa_health_reporter->npa_event_ctx); +} + +static irqreturn_t rvu_npa_af_ras_intr_handler(int irq, void *rvu_irq) +{ + struct rvu_npa_event_ctx *npa_event_context; + struct rvu_devlink *rvu_dl = rvu_irq; + struct rvu *rvu; + int blkaddr; + u64 intr; + + rvu = rvu_dl->rvu; + blkaddr = rvu_get_blkaddr(rvu, BLKTYPE_NPA, 0); + if (blkaddr < 0) + return IRQ_NONE; + + npa_event_context = rvu_dl->rvu_npa_health_reporter->npa_event_ctx; + intr = rvu_read64(rvu, blkaddr, NPA_AF_RAS); + npa_event_context->npa_af_rvu_ras = intr; + + /* Clear interrupts */ + rvu_write64(rvu, blkaddr, NPA_AF_RAS, intr); + rvu_write64(rvu, blkaddr, NPA_AF_RAS_ENA_W1C, ~0ULL); + queue_work(rvu_dl->devlink_wq, &rvu_dl->rvu_npa_health_reporter->ras_work); + + return IRQ_HANDLED; +} + +static void rvu_npa_unregister_interrupts(struct rvu *rvu) +{ + struct rvu_devlink *rvu_dl = rvu->rvu_dl; + int i, offs, blkaddr; + u64 reg; + + blkaddr = rvu_get_blkaddr(rvu, BLKTYPE_NPA, 0); + if (blkaddr < 0) + return; + + reg = rvu_read64(rvu, blkaddr, NPA_PRIV_AF_INT_CFG); + offs = reg & 0x3FF; + + rvu_write64(rvu, blkaddr, NPA_AF_RVU_INT_ENA_W1C, ~0ULL); + rvu_write64(rvu, blkaddr, NPA_AF_GEN_INT_ENA_W1C, ~0ULL); + rvu_write64(rvu, blkaddr, NPA_AF_ERR_INT_ENA_W1C, ~0ULL); + rvu_write64(rvu, blkaddr, NPA_AF_RAS_ENA_W1C, ~0ULL); + + for (i = 0; i < NPA_AF_INT_VEC_CNT; i++) + if (rvu->irq_allocated[offs + i]) { + free_irq(pci_irq_vector(rvu->pdev, offs + i), rvu_dl); + rvu->irq_allocated[offs + i] = false; + } +} + +static int rvu_npa_register_interrupts(struct rvu *rvu) +{ + int blkaddr, base; + bool rc; + + blkaddr = rvu_get_blkaddr(rvu, BLKTYPE_NPA, 0); + if (blkaddr < 0) + return blkaddr; + + /* Get NPA AF MSIX vectors offset. 
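+	 * The offset is in the low 10 bits of NPA_PRIV_AF_INT_CFG; a zero
+	 * offset is treated as the vectors being unavailable.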
*/ + base = rvu_read64(rvu, blkaddr, NPA_PRIV_AF_INT_CFG) & 0x3ff; + if (!base) { + dev_warn(rvu->dev, + "Failed to get NPA_AF_INT vector offsets\n"); + return 0; + } + + /* Register and enable NPA_AF_RVU_INT interrupt */ + rc = rvu_common_request_irq(rvu, base + NPA_AF_INT_VEC_RVU, + "NPA_AF_RVU_INT", + rvu_npa_af_rvu_intr_handler); + if (!rc) + goto err; + rvu_write64(rvu, blkaddr, NPA_AF_RVU_INT_ENA_W1S, ~0ULL); + + /* Register and enable NPA_AF_GEN_INT interrupt */ + rc = rvu_common_request_irq(rvu, base + NPA_AF_INT_VEC_GEN, + "NPA_AF_RVU_GEN", + rvu_npa_af_gen_intr_handler); + if (!rc) + goto err; + rvu_write64(rvu, blkaddr, NPA_AF_GEN_INT_ENA_W1S, ~0ULL); + + /* Register and enable NPA_AF_ERR_INT interrupt */ + rc = rvu_common_request_irq(rvu, base + NPA_AF_INT_VEC_AF_ERR, + "NPA_AF_ERR_INT", + rvu_npa_af_err_intr_handler); + if (!rc) + goto err; + rvu_write64(rvu, blkaddr, NPA_AF_ERR_INT_ENA_W1S, ~0ULL); + + /* Register and enable NPA_AF_RAS interrupt */ + rc = rvu_common_request_irq(rvu, base + NPA_AF_INT_VEC_POISON, + "NPA_AF_RAS", + rvu_npa_af_ras_intr_handler); + if (!rc) + goto err; + rvu_write64(rvu, blkaddr, NPA_AF_RAS_ENA_W1S, ~0ULL); + + return 0; +err: + rvu_npa_unregister_interrupts(rvu); + return rc; +} + +static int rvu_npa_report_show(struct devlink_fmsg *fmsg, void *ctx, + enum npa_af_rvu_health health_reporter) +{ + struct rvu_npa_event_ctx *npa_event_context; + unsigned int intr_val, alloc_dis, free_dis; + int err; + + npa_event_context = ctx; + switch (health_reporter) { + case NPA_AF_RVU_GEN: + intr_val = npa_event_context->npa_af_rvu_gen; + err = rvu_report_pair_start(fmsg, "NPA_AF_GENERAL"); + if (err) + return err; + err = devlink_fmsg_u64_pair_put(fmsg, "\tNPA General Interrupt Reg ", + npa_event_context->npa_af_rvu_gen); + if (err) + return err; + if (intr_val & BIT_ULL(32)) { + err = devlink_fmsg_string_put(fmsg, "\n\tUnmap PF Error"); + if (err) + return err; + } + + free_dis = FIELD_GET(GENMASK(15, 0), intr_val); + if (free_dis & BIT(NPA_INPQ_NIX0_RX)) { + err = devlink_fmsg_string_put(fmsg, "\n\tNIX0: free disabled RX"); + if (err) + return err; + } + if (free_dis & BIT(NPA_INPQ_NIX0_TX)) { + err = devlink_fmsg_string_put(fmsg, "\n\tNIX0:free disabled TX"); + if (err) + return err; + } + if (free_dis & BIT(NPA_INPQ_NIX1_RX)) { + err = devlink_fmsg_string_put(fmsg, "\n\tNIX1: free disabled RX"); + if (err) + return err; + } + if (free_dis & BIT(NPA_INPQ_NIX1_TX)) { + err = devlink_fmsg_string_put(fmsg, "\n\tNIX1:free disabled TX"); + if (err) + return err; + } + if (free_dis & BIT(NPA_INPQ_SSO)) { + err = devlink_fmsg_string_put(fmsg, "\n\tFree Disabled for SSO"); + if (err) + return err; + } + if (free_dis & BIT(NPA_INPQ_TIM)) { + err = devlink_fmsg_string_put(fmsg, "\n\tFree Disabled for TIM"); + if (err) + return err; + } + if (free_dis & BIT(NPA_INPQ_DPI)) { + err = devlink_fmsg_string_put(fmsg, "\n\tFree Disabled for DPI"); + if (err) + return err; + } + if (free_dis & BIT(NPA_INPQ_AURA_OP)) { + err = devlink_fmsg_string_put(fmsg, "\n\tFree Disabled for AURA"); + if (err) + return err; + } + + alloc_dis = FIELD_GET(GENMASK(31, 16), intr_val); + if (alloc_dis & BIT(NPA_INPQ_NIX0_RX)) { + err = devlink_fmsg_string_put(fmsg, "\n\tNIX0: alloc disabled RX"); + if (err) + return err; + } + if (alloc_dis & BIT(NPA_INPQ_NIX0_TX)) { + err = devlink_fmsg_string_put(fmsg, "\n\tNIX0:alloc disabled TX"); + if (err) + return err; + } + if (alloc_dis & BIT(NPA_INPQ_NIX1_RX)) { + err = devlink_fmsg_string_put(fmsg, "\n\tNIX1: alloc disabled RX"); + if (err) + return 
err; + } + if (alloc_dis & BIT(NPA_INPQ_NIX1_TX)) { + err = devlink_fmsg_string_put(fmsg, "\n\tNIX1:alloc disabled TX"); + if (err) + return err; + } + if (alloc_dis & BIT(NPA_INPQ_SSO)) { + err = devlink_fmsg_string_put(fmsg, "\n\tAlloc Disabled for SSO"); + if (err) + return err; + } + if (alloc_dis & BIT(NPA_INPQ_TIM)) { + err = devlink_fmsg_string_put(fmsg, "\n\tAlloc Disabled for TIM"); + if (err) + return err; + } + if (alloc_dis & BIT(NPA_INPQ_DPI)) { + err = devlink_fmsg_string_put(fmsg, "\n\tAlloc Disabled for DPI"); + if (err) + return err; + } + if (alloc_dis & BIT(NPA_INPQ_AURA_OP)) { + err = devlink_fmsg_string_put(fmsg, "\n\tAlloc Disabled for AURA"); + if (err) + return err; + } + err = rvu_report_pair_end(fmsg); + if (err) + return err; + break; + case NPA_AF_RVU_ERR: + err = rvu_report_pair_start(fmsg, "NPA_AF_ERR"); + if (err) + return err; + err = devlink_fmsg_u64_pair_put(fmsg, "\tNPA Error Interrupt Reg ", + npa_event_context->npa_af_rvu_err); + if (err) + return err; + + if (npa_event_context->npa_af_rvu_err & BIT_ULL(14)) { + err = devlink_fmsg_string_put(fmsg, "\n\tFault on NPA_AQ_INST_S read"); + if (err) + return err; + } + if (npa_event_context->npa_af_rvu_err & BIT_ULL(13)) { + err = devlink_fmsg_string_put(fmsg, "\n\tFault on NPA_AQ_RES_S write"); + if (err) + return err; + } + if (npa_event_context->npa_af_rvu_err & BIT_ULL(12)) { + err = devlink_fmsg_string_put(fmsg, "\n\tAQ Doorbell Error"); + if (err) + return err; + } + err = rvu_report_pair_end(fmsg); + if (err) + return err; + break; + case NPA_AF_RVU_RAS: + err = rvu_report_pair_start(fmsg, "NPA_AF_RVU_RAS"); + if (err) + return err; + err = devlink_fmsg_u64_pair_put(fmsg, "\tNPA RAS Interrupt Reg ", + npa_event_context->npa_af_rvu_ras); + if (err) + return err; + if (npa_event_context->npa_af_rvu_ras & BIT_ULL(34)) { + err = devlink_fmsg_string_put(fmsg, "\n\tPoison data on NPA_AQ_INST_S"); + if (err) + return err; + } + if (npa_event_context->npa_af_rvu_ras & BIT_ULL(33)) { + err = devlink_fmsg_string_put(fmsg, "\n\tPoison data on NPA_AQ_RES_S"); + if (err) + return err; + } + if (npa_event_context->npa_af_rvu_ras & BIT_ULL(32)) { + err = devlink_fmsg_string_put(fmsg, "\n\tPoison data on HW context"); + if (err) + return err; + } + err = rvu_report_pair_end(fmsg); + if (err) + return err; + break; + case NPA_AF_RVU_INTR: + err = rvu_report_pair_start(fmsg, "NPA_AF_RVU"); + if (err) + return err; + err = devlink_fmsg_u64_pair_put(fmsg, "\tNPA RVU Interrupt Reg ", + npa_event_context->npa_af_rvu_int); + if (err) + return err; + if (npa_event_context->npa_af_rvu_int & BIT_ULL(0)) { + err = devlink_fmsg_string_put(fmsg, "\n\tUnmap Slot Error"); + if (err) + return err; + } + return rvu_report_pair_end(fmsg); + default: + return -EINVAL; + } + + return 0; +} + +static int rvu_hw_npa_intr_dump(struct devlink_health_reporter *reporter, + struct devlink_fmsg *fmsg, void *ctx, + struct netlink_ext_ack *netlink_extack) +{ + struct rvu *rvu = devlink_health_reporter_priv(reporter); + struct rvu_devlink *rvu_dl = rvu->rvu_dl; + struct rvu_npa_event_ctx *npa_ctx; + + npa_ctx = rvu_dl->rvu_npa_health_reporter->npa_event_ctx; + + return ctx ? 
rvu_npa_report_show(fmsg, ctx, NPA_AF_RVU_INTR) : + rvu_npa_report_show(fmsg, npa_ctx, NPA_AF_RVU_INTR); +} + +static int rvu_hw_npa_intr_recover(struct devlink_health_reporter *reporter, + void *ctx, struct netlink_ext_ack *netlink_extack) +{ + struct rvu *rvu = devlink_health_reporter_priv(reporter); + struct rvu_npa_event_ctx *npa_event_ctx = ctx; + int blkaddr; + + blkaddr = rvu_get_blkaddr(rvu, BLKTYPE_NPA, 0); + if (blkaddr < 0) + return blkaddr; + + if (npa_event_ctx->npa_af_rvu_int) + rvu_write64(rvu, blkaddr, NPA_AF_RVU_INT_ENA_W1S, ~0ULL); + + return 0; +} + +static int rvu_hw_npa_gen_dump(struct devlink_health_reporter *reporter, + struct devlink_fmsg *fmsg, void *ctx, + struct netlink_ext_ack *netlink_extack) +{ + struct rvu *rvu = devlink_health_reporter_priv(reporter); + struct rvu_devlink *rvu_dl = rvu->rvu_dl; + struct rvu_npa_event_ctx *npa_ctx; + + npa_ctx = rvu_dl->rvu_npa_health_reporter->npa_event_ctx; + + return ctx ? rvu_npa_report_show(fmsg, ctx, NPA_AF_RVU_GEN) : + rvu_npa_report_show(fmsg, npa_ctx, NPA_AF_RVU_GEN); +} + +static int rvu_hw_npa_gen_recover(struct devlink_health_reporter *reporter, + void *ctx, struct netlink_ext_ack *netlink_extack) +{ + struct rvu *rvu = devlink_health_reporter_priv(reporter); + struct rvu_npa_event_ctx *npa_event_ctx = ctx; + int blkaddr; + + blkaddr = rvu_get_blkaddr(rvu, BLKTYPE_NPA, 0); + if (blkaddr < 0) + return blkaddr; + + if (npa_event_ctx->npa_af_rvu_gen) + rvu_write64(rvu, blkaddr, NPA_AF_GEN_INT_ENA_W1S, ~0ULL); + + return 0; +} + +static int rvu_hw_npa_err_dump(struct devlink_health_reporter *reporter, + struct devlink_fmsg *fmsg, void *ctx, + struct netlink_ext_ack *netlink_extack) +{ + struct rvu *rvu = devlink_health_reporter_priv(reporter); + struct rvu_devlink *rvu_dl = rvu->rvu_dl; + struct rvu_npa_event_ctx *npa_ctx; + + npa_ctx = rvu_dl->rvu_npa_health_reporter->npa_event_ctx; + + return ctx ? rvu_npa_report_show(fmsg, ctx, NPA_AF_RVU_ERR) : + rvu_npa_report_show(fmsg, npa_ctx, NPA_AF_RVU_ERR); +} + +static int rvu_hw_npa_err_recover(struct devlink_health_reporter *reporter, + void *ctx, struct netlink_ext_ack *netlink_extack) +{ + struct rvu *rvu = devlink_health_reporter_priv(reporter); + struct rvu_npa_event_ctx *npa_event_ctx = ctx; + int blkaddr; + + blkaddr = rvu_get_blkaddr(rvu, BLKTYPE_NPA, 0); + if (blkaddr < 0) + return blkaddr; + + if (npa_event_ctx->npa_af_rvu_err) + rvu_write64(rvu, blkaddr, NPA_AF_ERR_INT_ENA_W1S, ~0ULL); + + return 0; +} + +static int rvu_hw_npa_ras_dump(struct devlink_health_reporter *reporter, + struct devlink_fmsg *fmsg, void *ctx, + struct netlink_ext_ack *netlink_extack) +{ + struct rvu *rvu = devlink_health_reporter_priv(reporter); + struct rvu_devlink *rvu_dl = rvu->rvu_dl; + struct rvu_npa_event_ctx *npa_ctx; + + npa_ctx = rvu_dl->rvu_npa_health_reporter->npa_event_ctx; + + return ctx ? 
rvu_npa_report_show(fmsg, ctx, NPA_AF_RVU_RAS) : + rvu_npa_report_show(fmsg, npa_ctx, NPA_AF_RVU_RAS); +} + +static int rvu_hw_npa_ras_recover(struct devlink_health_reporter *reporter, + void *ctx, struct netlink_ext_ack *netlink_extack) +{ + struct rvu *rvu = devlink_health_reporter_priv(reporter); + struct rvu_npa_event_ctx *npa_event_ctx = ctx; + int blkaddr; + + blkaddr = rvu_get_blkaddr(rvu, BLKTYPE_NPA, 0); + if (blkaddr < 0) + return blkaddr; + + if (npa_event_ctx->npa_af_rvu_ras) + rvu_write64(rvu, blkaddr, NPA_AF_RAS_ENA_W1S, ~0ULL); + + return 0; +} + +RVU_REPORTERS(hw_npa_intr); +RVU_REPORTERS(hw_npa_gen); +RVU_REPORTERS(hw_npa_err); +RVU_REPORTERS(hw_npa_ras); + +static void rvu_npa_health_reporters_destroy(struct rvu_devlink *rvu_dl); + +static int rvu_npa_register_reporters(struct rvu_devlink *rvu_dl) +{ + struct rvu_npa_health_reporters *rvu_reporters; + struct rvu_npa_event_ctx *npa_event_context; + struct rvu *rvu = rvu_dl->rvu; + + rvu_reporters = kzalloc(sizeof(*rvu_reporters), GFP_KERNEL); + if (!rvu_reporters) + return -ENOMEM; + + rvu_dl->rvu_npa_health_reporter = rvu_reporters; + npa_event_context = kzalloc(sizeof(*npa_event_context), GFP_KERNEL); + if (!npa_event_context) + return -ENOMEM; + + rvu_reporters->npa_event_ctx = npa_event_context; + rvu_reporters->rvu_hw_npa_intr_reporter = + devlink_health_reporter_create(rvu_dl->dl, &rvu_hw_npa_intr_reporter_ops, 0, rvu); + if (IS_ERR(rvu_reporters->rvu_hw_npa_intr_reporter)) { + dev_warn(rvu->dev, "Failed to create hw_npa_intr reporter, err=%ld\n", + PTR_ERR(rvu_reporters->rvu_hw_npa_intr_reporter)); + return PTR_ERR(rvu_reporters->rvu_hw_npa_intr_reporter); + } + + rvu_reporters->rvu_hw_npa_gen_reporter = + devlink_health_reporter_create(rvu_dl->dl, &rvu_hw_npa_gen_reporter_ops, 0, rvu); + if (IS_ERR(rvu_reporters->rvu_hw_npa_gen_reporter)) { + dev_warn(rvu->dev, "Failed to create hw_npa_gen reporter, err=%ld\n", + PTR_ERR(rvu_reporters->rvu_hw_npa_gen_reporter)); + return PTR_ERR(rvu_reporters->rvu_hw_npa_gen_reporter); + } + + rvu_reporters->rvu_hw_npa_err_reporter = + devlink_health_reporter_create(rvu_dl->dl, &rvu_hw_npa_err_reporter_ops, 0, rvu); + if (IS_ERR(rvu_reporters->rvu_hw_npa_err_reporter)) { + dev_warn(rvu->dev, "Failed to create hw_npa_err reporter, err=%ld\n", + PTR_ERR(rvu_reporters->rvu_hw_npa_err_reporter)); + return PTR_ERR(rvu_reporters->rvu_hw_npa_err_reporter); + } + + rvu_reporters->rvu_hw_npa_ras_reporter = + devlink_health_reporter_create(rvu_dl->dl, &rvu_hw_npa_ras_reporter_ops, 0, rvu); + if (IS_ERR(rvu_reporters->rvu_hw_npa_ras_reporter)) { + dev_warn(rvu->dev, "Failed to create hw_npa_ras reporter, err=%ld\n", + PTR_ERR(rvu_reporters->rvu_hw_npa_ras_reporter)); + return PTR_ERR(rvu_reporters->rvu_hw_npa_ras_reporter); + } + + rvu_dl->devlink_wq = create_workqueue("rvu_devlink_wq"); + if (!rvu_dl->devlink_wq) + goto err; + + INIT_WORK(&rvu_reporters->intr_work, rvu_npa_intr_work); + INIT_WORK(&rvu_reporters->err_work, rvu_npa_err_work); + INIT_WORK(&rvu_reporters->gen_work, rvu_npa_gen_work); + INIT_WORK(&rvu_reporters->ras_work, rvu_npa_ras_work); + + return 0; +err: + rvu_npa_health_reporters_destroy(rvu_dl); + return -ENOMEM; +} + +static int rvu_npa_health_reporters_create(struct rvu_devlink *rvu_dl) +{ + struct rvu *rvu = rvu_dl->rvu; + int err; + + err = rvu_npa_register_reporters(rvu_dl); + if (err) { + dev_warn(rvu->dev, "Failed to create npa reporter, err =%d\n", + err); + return err; + } + rvu_npa_register_interrupts(rvu); + + return 0; +} + +static void 
rvu_npa_health_reporters_destroy(struct rvu_devlink *rvu_dl) +{ + struct rvu_npa_health_reporters *npa_reporters; + struct rvu *rvu = rvu_dl->rvu; + + npa_reporters = rvu_dl->rvu_npa_health_reporter; + + if (!npa_reporters->rvu_hw_npa_ras_reporter) + return; + if (!IS_ERR_OR_NULL(npa_reporters->rvu_hw_npa_intr_reporter)) + devlink_health_reporter_destroy(npa_reporters->rvu_hw_npa_intr_reporter); + + if (!IS_ERR_OR_NULL(npa_reporters->rvu_hw_npa_gen_reporter)) + devlink_health_reporter_destroy(npa_reporters->rvu_hw_npa_gen_reporter); + + if (!IS_ERR_OR_NULL(npa_reporters->rvu_hw_npa_err_reporter)) + devlink_health_reporter_destroy(npa_reporters->rvu_hw_npa_err_reporter); + + if (!IS_ERR_OR_NULL(npa_reporters->rvu_hw_npa_ras_reporter)) + devlink_health_reporter_destroy(npa_reporters->rvu_hw_npa_ras_reporter); + + rvu_npa_unregister_interrupts(rvu); + kfree(rvu_dl->rvu_npa_health_reporter->npa_event_ctx); + kfree(rvu_dl->rvu_npa_health_reporter); +} + +static int rvu_health_reporters_create(struct rvu *rvu) +{ + struct rvu_devlink *rvu_dl; + + rvu_dl = rvu->rvu_dl; + return rvu_npa_health_reporters_create(rvu_dl); +} + +static void rvu_health_reporters_destroy(struct rvu *rvu) +{ + struct rvu_devlink *rvu_dl; + + if (!rvu->rvu_dl) + return; + + rvu_dl = rvu->rvu_dl; + rvu_npa_health_reporters_destroy(rvu_dl); +} + +static int rvu_devlink_info_get(struct devlink *devlink, struct devlink_info_req *req, + struct netlink_ext_ack *extack) +{ + return devlink_info_driver_name_put(req, DRV_NAME); +} + +static const struct devlink_ops rvu_devlink_ops = { + .info_get = rvu_devlink_info_get, +}; + +int rvu_register_dl(struct rvu *rvu) +{ + struct rvu_devlink *rvu_dl; + struct devlink *dl; + int err; + + rvu_dl = kzalloc(sizeof(*rvu_dl), GFP_KERNEL); + if (!rvu_dl) + return -ENOMEM; + + dl = devlink_alloc(&rvu_devlink_ops, sizeof(struct rvu_devlink)); + if (!dl) { + dev_warn(rvu->dev, "devlink_alloc failed\n"); + kfree(rvu_dl); + return -ENOMEM; + } + + err = devlink_register(dl, rvu->dev); + if (err) { + dev_err(rvu->dev, "devlink register failed with error %d\n", err); + devlink_free(dl); + kfree(rvu_dl); + return err; + } + + rvu_dl->dl = dl; + rvu_dl->rvu = rvu; + rvu->rvu_dl = rvu_dl; + + return rvu_health_reporters_create(rvu); +} + +void rvu_unregister_dl(struct rvu *rvu) +{ + struct rvu_devlink *rvu_dl = rvu->rvu_dl; + struct devlink *dl = rvu_dl->dl; + + if (!dl) + return; + + rvu_health_reporters_destroy(rvu); + devlink_unregister(dl); + devlink_free(dl); + kfree(rvu_dl); +} diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_devlink.h b/drivers/net/ethernet/marvell/octeontx2/af/rvu_devlink.h new file mode 100644 index 000000000000..d7578fa92ac1 --- /dev/null +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_devlink.h @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Marvell OcteonTx2 RVU Devlink + * + * Copyright (C) 2020 Marvell. 
+ * + */ + +#ifndef RVU_DEVLINK_H +#define RVU_DEVLINK_H + +#define RVU_REPORTERS(_name) \ +static const struct devlink_health_reporter_ops rvu_ ## _name ## _reporter_ops = { \ + .name = #_name, \ + .recover = rvu_ ## _name ## _recover, \ + .dump = rvu_ ## _name ## _dump, \ +} + +enum npa_af_rvu_health { + NPA_AF_RVU_INTR, + NPA_AF_RVU_GEN, + NPA_AF_RVU_ERR, + NPA_AF_RVU_RAS, +}; + +struct rvu_npa_event_ctx { + u64 npa_af_rvu_int; + u64 npa_af_rvu_gen; + u64 npa_af_rvu_err; + u64 npa_af_rvu_ras; +}; + +struct rvu_npa_health_reporters { + struct rvu_npa_event_ctx *npa_event_ctx; + struct devlink_health_reporter *rvu_hw_npa_intr_reporter; + struct work_struct intr_work; + struct devlink_health_reporter *rvu_hw_npa_gen_reporter; + struct work_struct gen_work; + struct devlink_health_reporter *rvu_hw_npa_err_reporter; + struct work_struct err_work; + struct devlink_health_reporter *rvu_hw_npa_ras_reporter; + struct work_struct ras_work; +}; + +struct rvu_devlink { + struct devlink *dl; + struct rvu *rvu; + struct workqueue_struct *devlink_wq; + struct rvu_npa_health_reporters *rvu_npa_health_reporter; +}; + +/* Devlink APIs */ +int rvu_register_dl(struct rvu *rvu); +void rvu_unregister_dl(struct rvu *rvu); + +#endif /* RVU_DEVLINK_H */ diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_struct.h b/drivers/net/ethernet/marvell/octeontx2/af/rvu_struct.h index 723643868589..e2153d47c373 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_struct.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_struct.h @@ -64,6 +64,16 @@ enum rvu_af_int_vec_e { RVU_AF_INT_VEC_CNT = 0x5, }; +/* NPA Admin function Interrupt Vector Enumeration */ +enum npa_af_int_vec_e { + NPA_AF_INT_VEC_RVU = 0x0, + NPA_AF_INT_VEC_GEN = 0x1, + NPA_AF_INT_VEC_AQ_DONE = 0x2, + NPA_AF_INT_VEC_AF_ERR = 0x3, + NPA_AF_INT_VEC_POISON = 0x4, + NPA_AF_INT_VEC_CNT = 0x5, +}; + /** * RVU PF Interrupt Vector Enumeration */ @@ -104,6 +114,19 @@ enum npa_aq_instop { NPA_AQ_INSTOP_UNLOCK = 0x5, }; +/* ALLOC/FREE input queues Enumeration from coprocessors */ +enum npa_inpq { + NPA_INPQ_NIX0_RX = 0x0, + NPA_INPQ_NIX0_TX = 0x1, + NPA_INPQ_NIX1_RX = 0x2, + NPA_INPQ_NIX1_TX = 0x3, + NPA_INPQ_SSO = 0x4, + NPA_INPQ_TIM = 0x5, + NPA_INPQ_DPI = 0x6, + NPA_INPQ_AURA_OP = 0xe, + NPA_INPQ_INTERNAL_RSV = 0xf, +}; + /* NPA admin queue instruction structure */ struct npa_aq_inst_s { #if defined(__BIG_ENDIAN_BITFIELD) diff --git a/drivers/net/ethernet/mediatek/mtk_eth_path.c b/drivers/net/ethernet/mediatek/mtk_eth_path.c index 6bc9f2487384..72648535a14d 100644 --- a/drivers/net/ethernet/mediatek/mtk_eth_path.c +++ b/drivers/net/ethernet/mediatek/mtk_eth_path.c @@ -252,7 +252,7 @@ int mtk_gmac_sgmii_path_setup(struct mtk_eth *eth, int mac_id) int mtk_gmac_gephy_path_setup(struct mtk_eth *eth, int mac_id) { - int err, path = 0; + int path = 0; if (mac_id == 1) path = MTK_ETH_PATH_GMAC2_GEPHY; @@ -261,25 +261,17 @@ int mtk_gmac_gephy_path_setup(struct mtk_eth *eth, int mac_id) return -EINVAL; /* Setup proper MUXes along the path */ - err = mtk_eth_mux_setup(eth, path); - if (err) - return err; - - return 0; + return mtk_eth_mux_setup(eth, path); } int mtk_gmac_rgmii_path_setup(struct mtk_eth *eth, int mac_id) { - int err, path; + int path; path = (mac_id == 0) ? 
MTK_ETH_PATH_GMAC1_RGMII : MTK_ETH_PATH_GMAC2_RGMII; /* Setup proper MUXes along the path */ - err = mtk_eth_mux_setup(eth, path); - if (err) - return err; - - return 0; + return mtk_eth_mux_setup(eth, path); } diff --git a/drivers/net/ethernet/mellanox/mlxsw/Makefile b/drivers/net/ethernet/mellanox/mlxsw/Makefile index 892724380ea2..f545fd2c5896 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/Makefile +++ b/drivers/net/ethernet/mellanox/mlxsw/Makefile @@ -15,6 +15,7 @@ mlxsw_switchx2-objs := switchx2.o obj-$(CONFIG_MLXSW_SPECTRUM) += mlxsw_spectrum.o mlxsw_spectrum-objs := spectrum.o spectrum_buffers.o \ spectrum_switchdev.o spectrum_router.o \ + spectrum_router_xm.o \ spectrum1_kvdl.o spectrum2_kvdl.o \ spectrum_kvdl.o \ spectrum_acl_tcam.o spectrum_acl_ctcam.o \ diff --git a/drivers/net/ethernet/mellanox/mlxsw/cmd.h b/drivers/net/ethernet/mellanox/mlxsw/cmd.h index 5ffdfb532cb7..392ce3cb27f7 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/cmd.h +++ b/drivers/net/ethernet/mellanox/mlxsw/cmd.h @@ -343,6 +343,23 @@ static inline int mlxsw_cmd_boardinfo(struct mlxsw_core *mlxsw_core, 0, 0, false, out_mbox, MLXSW_CMD_MBOX_SIZE); } +/* cmd_mbox_xm_num_local_ports + * Number of local_ports connected to the xm. + * Each local port is a 4x + * Spectrum-2/3: 25G + * Spectrum-4: 50G + */ +MLXSW_ITEM32(cmd_mbox, boardinfo, xm_num_local_ports, 0x00, 4, 3); + +/* cmd_mbox_xm_exists + * An XM (eXtanded Mezanine, e.g. used for the XLT) is connected on the board. + */ +MLXSW_ITEM32(cmd_mbox, boardinfo, xm_exists, 0x00, 0, 1); + +/* cmd_mbox_xm_local_port_entry + */ +MLXSW_ITEM_BIT_ARRAY(cmd_mbox, boardinfo, xm_local_port_entry, 0x04, 4, 8); + /* cmd_mbox_boardinfo_intapin * When PCIe interrupt messages are being used, this value is used for clearing * an interrupt. When using MSI-X, this register is not used. @@ -657,6 +674,12 @@ MLXSW_ITEM32(cmd_mbox, config_profile, set_kvd_hash_double_size, 0x0C, 26, 1); */ MLXSW_ITEM32(cmd_mbox, config_profile, set_cqe_version, 0x08, 0, 1); +/* cmd_mbox_config_set_kvh_xlt_cache_mode + * Capability bit. Setting a bit to 1 configures the profile + * according to the mailbox contents. 
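+ * When set, the kvh_xlt_cache_mode field below is applied (see
+ * mlxsw_pci_config_profile()).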
+ */ +MLXSW_ITEM32(cmd_mbox, config_profile, set_kvh_xlt_cache_mode, 0x08, 3, 1); + /* cmd_mbox_config_profile_max_vepa_channels * Maximum number of VEPA channels per port (0 through 16) * 0 - multi-channel VEPA is disabled @@ -783,6 +806,13 @@ MLXSW_ITEM32(cmd_mbox, config_profile, adaptive_routing_group_cap, 0x4C, 0, 16); */ MLXSW_ITEM32(cmd_mbox, config_profile, arn, 0x50, 31, 1); +/* cmd_mbox_config_profile_kvh_xlt_cache_mode + * KVH XLT cache mode: + * 0 - XLT can use all KVH as best-effort + * 1 - XLT cache uses 1/2 KVH + */ +MLXSW_ITEM32(cmd_mbox, config_profile, kvh_xlt_cache_mode, 0x50, 8, 4); + /* cmd_mbox_config_kvd_linear_size * KVD Linear Size * Valid for Spectrum only diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.c b/drivers/net/ethernet/mellanox/mlxsw/core.c index c67825a68a26..685037e052af 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/core.c +++ b/drivers/net/ethernet/mellanox/mlxsw/core.c @@ -2856,6 +2856,18 @@ mlxsw_core_port_devlink_port_get(struct mlxsw_core *mlxsw_core, } EXPORT_SYMBOL(mlxsw_core_port_devlink_port_get); +bool mlxsw_core_port_is_xm(const struct mlxsw_core *mlxsw_core, u8 local_port) +{ + const struct mlxsw_bus_info *bus_info = mlxsw_core->bus_info; + int i; + + for (i = 0; i < bus_info->xm_local_ports_count; i++) + if (bus_info->xm_local_ports[i] == local_port) + return true; + return false; +} +EXPORT_SYMBOL(mlxsw_core_port_is_xm); + struct mlxsw_env *mlxsw_core_env(const struct mlxsw_core *mlxsw_core) { return mlxsw_core->env; diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.h b/drivers/net/ethernet/mellanox/mlxsw/core.h index 92f7398287be..6b3ccbf6b238 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/core.h +++ b/drivers/net/ethernet/mellanox/mlxsw/core.h @@ -223,6 +223,7 @@ enum devlink_port_type mlxsw_core_port_type_get(struct mlxsw_core *mlxsw_core, struct devlink_port * mlxsw_core_port_devlink_port_get(struct mlxsw_core *mlxsw_core, u8 local_port); +bool mlxsw_core_port_is_xm(const struct mlxsw_core *mlxsw_core, u8 local_port); struct mlxsw_env *mlxsw_core_env(const struct mlxsw_core *mlxsw_core); bool mlxsw_core_is_initialized(const struct mlxsw_core *mlxsw_core); int mlxsw_core_module_max_width(struct mlxsw_core *mlxsw_core, u8 module); @@ -255,7 +256,8 @@ struct mlxsw_config_profile { used_max_pkey:1, used_ar_sec:1, used_adaptive_routing_group_cap:1, - used_kvd_sizes:1; + used_kvd_sizes:1, + used_kvh_xlt_cache_mode:1; u8 max_vepa_channels; u16 max_mid; u16 max_pgt; @@ -277,6 +279,7 @@ struct mlxsw_config_profile { u32 kvd_linear_size; u8 kvd_hash_single_parts; u8 kvd_hash_double_parts; + u8 kvh_xlt_cache_mode; struct mlxsw_swid_config swid_config[MLXSW_CONFIG_PROFILE_SWID_COUNT]; }; @@ -435,6 +438,8 @@ struct mlxsw_fw_rev { u16 can_reset_minor; }; +#define MLXSW_BUS_INFO_XM_LOCAL_PORTS_MAX 4 + struct mlxsw_bus_info { const char *device_kind; const char *device_name; @@ -443,7 +448,10 @@ struct mlxsw_bus_info { u8 vsd[MLXSW_CMD_BOARDINFO_VSD_LEN]; u8 psid[MLXSW_CMD_BOARDINFO_PSID_LEN]; u8 low_frequency:1, - read_frc_capable:1; + read_frc_capable:1, + xm_exists:1; + u8 xm_local_ports_count; + u8 xm_local_ports[MLXSW_BUS_INFO_XM_LOCAL_PORTS_MAX]; }; struct mlxsw_hwmon; diff --git a/drivers/net/ethernet/mellanox/mlxsw/minimal.c b/drivers/net/ethernet/mellanox/mlxsw/minimal.c index c010db2c9dba..b34c44723f8b 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/minimal.c +++ b/drivers/net/ethernet/mellanox/mlxsw/minimal.c @@ -291,7 +291,8 @@ static int mlxsw_m_ports_create(struct mlxsw_m *mlxsw_m) /* Create port objects for each valid 
entry */ for (i = 0; i < mlxsw_m->max_ports; i++) { - if (mlxsw_m->module_to_port[i] > 0) { + if (mlxsw_m->module_to_port[i] > 0 && + !mlxsw_core_port_is_xm(mlxsw_m->core, i)) { err = mlxsw_m_port_create(mlxsw_m, mlxsw_m->module_to_port[i], i); diff --git a/drivers/net/ethernet/mellanox/mlxsw/pci.c b/drivers/net/ethernet/mellanox/mlxsw/pci.c index 641cdd81882b..4eeae8d78006 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/pci.c +++ b/drivers/net/ethernet/mellanox/mlxsw/pci.c @@ -1196,6 +1196,12 @@ static int mlxsw_pci_config_profile(struct mlxsw_pci *mlxsw_pci, char *mbox, mlxsw_cmd_mbox_config_profile_kvd_hash_double_size_set(mbox, MLXSW_RES_GET(res, KVD_DOUBLE_SIZE)); } + if (profile->used_kvh_xlt_cache_mode) { + mlxsw_cmd_mbox_config_profile_set_kvh_xlt_cache_mode_set( + mbox, 1); + mlxsw_cmd_mbox_config_profile_kvh_xlt_cache_mode_set( + mbox, profile->kvh_xlt_cache_mode); + } for (i = 0; i < MLXSW_CONFIG_PROFILE_SWID_COUNT; i++) mlxsw_pci_config_profile_swid_config(mlxsw_pci, mbox, i, @@ -1209,6 +1215,30 @@ static int mlxsw_pci_config_profile(struct mlxsw_pci *mlxsw_pci, char *mbox, return mlxsw_cmd_config_profile_set(mlxsw_pci->core, mbox); } +static int mlxsw_pci_boardinfo_xm_process(struct mlxsw_pci *mlxsw_pci, + struct mlxsw_bus_info *bus_info, + char *mbox) +{ + int count = mlxsw_cmd_mbox_boardinfo_xm_num_local_ports_get(mbox); + int i; + + if (!mlxsw_cmd_mbox_boardinfo_xm_exists_get(mbox)) + return 0; + + bus_info->xm_exists = true; + + if (count > MLXSW_BUS_INFO_XM_LOCAL_PORTS_MAX) { + dev_err(&mlxsw_pci->pdev->dev, "Invalid number of XM local ports\n"); + return -EINVAL; + } + bus_info->xm_local_ports_count = count; + for (i = 0; i < count; i++) + bus_info->xm_local_ports[i] = + mlxsw_cmd_mbox_boardinfo_xm_local_port_entry_get(mbox, + i); + return 0; +} + static int mlxsw_pci_boardinfo(struct mlxsw_pci *mlxsw_pci, char *mbox) { struct mlxsw_bus_info *bus_info = &mlxsw_pci->bus_info; @@ -1220,7 +1250,8 @@ static int mlxsw_pci_boardinfo(struct mlxsw_pci *mlxsw_pci, char *mbox) return err; mlxsw_cmd_mbox_boardinfo_vsd_memcpy_from(mbox, bus_info->vsd); mlxsw_cmd_mbox_boardinfo_psid_memcpy_from(mbox, bus_info->psid); - return 0; + + return mlxsw_pci_boardinfo_xm_process(mlxsw_pci, bus_info, mbox); } static int mlxsw_pci_fw_area_init(struct mlxsw_pci *mlxsw_pci, char *mbox, diff --git a/drivers/net/ethernet/mellanox/mlxsw/reg.h b/drivers/net/ethernet/mellanox/mlxsw/reg.h index 2a89b3261f00..16e2df6ef2f4 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/reg.h +++ b/drivers/net/ethernet/mellanox/mlxsw/reg.h @@ -8469,10 +8469,582 @@ mlxsw_reg_rmft2_ipv6_pack(char *payload, bool v, u16 offset, u16 virtual_router, mlxsw_reg_rmft2_sip6_mask_memcpy_to(payload, (void *)&sip6_mask); } -/* Note that XRALXX register position violates the rule of ordering register - * definition by the ID. However, XRALXX pack helpers are using RALXX pack - * helpers, RALXX registers have higher IDs. +/* RXLTE - Router XLT Enable Register + * ---------------------------------- + * The RXLTE enables XLT (eXtended Lookup Table) LPM lookups if a capable + * XM is present on the system. + */ + +#define MLXSW_REG_RXLTE_ID 0x8050 +#define MLXSW_REG_RXLTE_LEN 0x0C + +MLXSW_REG_DEFINE(rxlte, MLXSW_REG_RXLTE_ID, MLXSW_REG_RXLTE_LEN); + +/* reg_rxlte_virtual_router + * Virtual router ID associated with the router interface. 
+ * Range is 0..cap_max_virtual_routers-1 + * Access: Index + */ +MLXSW_ITEM32(reg, rxlte, virtual_router, 0x00, 0, 16); + +enum mlxsw_reg_rxlte_protocol { + MLXSW_REG_RXLTE_PROTOCOL_IPV4, + MLXSW_REG_RXLTE_PROTOCOL_IPV6, +}; + +/* reg_rxlte_protocol + * Access: Index + */ +MLXSW_ITEM32(reg, rxlte, protocol, 0x04, 0, 4); + +/* reg_rxlte_lpm_xlt_en + * Access: RW */ +MLXSW_ITEM32(reg, rxlte, lpm_xlt_en, 0x08, 0, 1); + +static inline void mlxsw_reg_rxlte_pack(char *payload, u16 virtual_router, + enum mlxsw_reg_rxlte_protocol protocol, + bool lpm_xlt_en) +{ + MLXSW_REG_ZERO(rxlte, payload); + mlxsw_reg_rxlte_virtual_router_set(payload, virtual_router); + mlxsw_reg_rxlte_protocol_set(payload, protocol); + mlxsw_reg_rxlte_lpm_xlt_en_set(payload, lpm_xlt_en); +} + +/* RXLTM - Router XLT M select Register + * ------------------------------------ + * The RXLTM configures and selects the M for the XM lookups. + */ + +#define MLXSW_REG_RXLTM_ID 0x8051 +#define MLXSW_REG_RXLTM_LEN 0x14 + +MLXSW_REG_DEFINE(rxltm, MLXSW_REG_RXLTM_ID, MLXSW_REG_RXLTM_LEN); + +/* reg_rxltm_m0_val_v6 + * Global M0 value For IPv6. + * Range 0..128 + * Access: RW + */ +MLXSW_ITEM32(reg, rxltm, m0_val_v6, 0x10, 16, 8); + +/* reg_rxltm_m0_val_v4 + * Global M0 value For IPv4. + * Range 0..32 + * Access: RW + */ +MLXSW_ITEM32(reg, rxltm, m0_val_v4, 0x10, 0, 6); + +static inline void mlxsw_reg_rxltm_pack(char *payload, u8 m0_val_v4, u8 m0_val_v6) +{ + MLXSW_REG_ZERO(rxltm, payload); + mlxsw_reg_rxltm_m0_val_v6_set(payload, m0_val_v6); + mlxsw_reg_rxltm_m0_val_v4_set(payload, m0_val_v4); +} + +/* RLCMLD - Router LPM Cache ML Delete Register + * -------------------------------------------- + * The RLCMLD register is used to bulk delete the XLT-LPM cache ML entries. + * This can be used by SW when L is increased or decreased, thus need to + * remove entries with old ML values. + */ + +#define MLXSW_REG_RLCMLD_ID 0x8055 +#define MLXSW_REG_RLCMLD_LEN 0x30 + +MLXSW_REG_DEFINE(rlcmld, MLXSW_REG_RLCMLD_ID, MLXSW_REG_RLCMLD_LEN); + +enum mlxsw_reg_rlcmld_select { + MLXSW_REG_RLCMLD_SELECT_ML_ENTRIES, + MLXSW_REG_RLCMLD_SELECT_M_ENTRIES, + MLXSW_REG_RLCMLD_SELECT_M_AND_ML_ENTRIES, +}; + +/* reg_rlcmld_select + * Which entries to delete. + * Access: Index + */ +MLXSW_ITEM32(reg, rlcmld, select, 0x00, 16, 2); + +enum mlxsw_reg_rlcmld_filter_fields { + MLXSW_REG_RLCMLD_FILTER_FIELDS_BY_PROTOCOL = 0x04, + MLXSW_REG_RLCMLD_FILTER_FIELDS_BY_VIRTUAL_ROUTER = 0x08, + MLXSW_REG_RLCMLD_FILTER_FIELDS_BY_DIP = 0x10, +}; + +/* reg_rlcmld_filter_fields + * If a bit is '0' then the relevant field is ignored. + * Access: Index + */ +MLXSW_ITEM32(reg, rlcmld, filter_fields, 0x00, 0, 8); + +enum mlxsw_reg_rlcmld_protocol { + MLXSW_REG_RLCMLD_PROTOCOL_UC_IPV4, + MLXSW_REG_RLCMLD_PROTOCOL_UC_IPV6, +}; + +/* reg_rlcmld_protocol + * Access: Index + */ +MLXSW_ITEM32(reg, rlcmld, protocol, 0x08, 0, 4); + +/* reg_rlcmld_virtual_router + * Virtual router ID. + * Range is 0..cap_max_virtual_routers-1 + * Access: Index + */ +MLXSW_ITEM32(reg, rlcmld, virtual_router, 0x0C, 0, 16); + +/* reg_rlcmld_dip + * The prefix of the route or of the marker that the object of the LPM + * is compared with. The most significant bits of the dip are the prefix. 
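+ * dip4 is used for IPv4 and dip6 for IPv6, matching the protocol field
+ * (see mlxsw_reg_rlcmld_pack4() and mlxsw_reg_rlcmld_pack6()).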
+ * Access: Index + */ +MLXSW_ITEM32(reg, rlcmld, dip4, 0x1C, 0, 32); +MLXSW_ITEM_BUF(reg, rlcmld, dip6, 0x10, 16); + +/* reg_rlcmld_dip_mask + * per bit: + * 0: no match + * 1: match + * Access: Index + */ +MLXSW_ITEM32(reg, rlcmld, dip_mask4, 0x2C, 0, 32); +MLXSW_ITEM_BUF(reg, rlcmld, dip_mask6, 0x20, 16); + +static inline void __mlxsw_reg_rlcmld_pack(char *payload, + enum mlxsw_reg_rlcmld_select select, + enum mlxsw_reg_rlcmld_protocol protocol, + u16 virtual_router) +{ + u8 filter_fields = MLXSW_REG_RLCMLD_FILTER_FIELDS_BY_PROTOCOL | + MLXSW_REG_RLCMLD_FILTER_FIELDS_BY_VIRTUAL_ROUTER | + MLXSW_REG_RLCMLD_FILTER_FIELDS_BY_DIP; + + MLXSW_REG_ZERO(rlcmld, payload); + mlxsw_reg_rlcmld_select_set(payload, select); + mlxsw_reg_rlcmld_filter_fields_set(payload, filter_fields); + mlxsw_reg_rlcmld_protocol_set(payload, protocol); + mlxsw_reg_rlcmld_virtual_router_set(payload, virtual_router); +} + +static inline void mlxsw_reg_rlcmld_pack4(char *payload, + enum mlxsw_reg_rlcmld_select select, + u16 virtual_router, + u32 dip, u32 dip_mask) +{ + __mlxsw_reg_rlcmld_pack(payload, select, + MLXSW_REG_RLCMLD_PROTOCOL_UC_IPV4, + virtual_router); + mlxsw_reg_rlcmld_dip4_set(payload, dip); + mlxsw_reg_rlcmld_dip_mask4_set(payload, dip_mask); +} + +static inline void mlxsw_reg_rlcmld_pack6(char *payload, + enum mlxsw_reg_rlcmld_select select, + u16 virtual_router, + const void *dip, const void *dip_mask) +{ + __mlxsw_reg_rlcmld_pack(payload, select, + MLXSW_REG_RLCMLD_PROTOCOL_UC_IPV6, + virtual_router); + mlxsw_reg_rlcmld_dip6_memcpy_to(payload, dip); + mlxsw_reg_rlcmld_dip_mask6_memcpy_to(payload, dip_mask); +} + +/* RLPMCE - Router LPM Cache Enable Register + * ----------------------------------------- + * Allows disabling the LPM cache. Can be changed on the fly. + */ + +#define MLXSW_REG_RLPMCE_ID 0x8056 +#define MLXSW_REG_RLPMCE_LEN 0x4 + +MLXSW_REG_DEFINE(rlpmce, MLXSW_REG_RLPMCE_ID, MLXSW_REG_RLPMCE_LEN); + +/* reg_rlpmce_flush + * Flush: + * 0: do not flush the cache (default) + * 1: flush (clear) the cache + * Access: WO + */ +MLXSW_ITEM32(reg, rlpmce, flush, 0x00, 4, 1); + +/* reg_rlpmce_disable + * LPM cache: + * 0: enabled (default) + * 1: disabled + * Access: RW + */ +MLXSW_ITEM32(reg, rlpmce, disable, 0x00, 0, 1); + +static inline void mlxsw_reg_rlpmce_pack(char *payload, bool flush, + bool disable) +{ + MLXSW_REG_ZERO(rlpmce, payload); + mlxsw_reg_rlpmce_flush_set(payload, flush); + mlxsw_reg_rlpmce_disable_set(payload, disable); +} + +/* Note that XLTQ, XMDR, XRMT and XRALXX register positions violate the rule + * of ordering register definitions by the ID. However, XRALXX pack helpers are + * using RALXX pack helpers, RALXX registers have higher IDs. + * Also XMDR is using RALUE enums. XLRQ and XRMT are just put alongside with the + * related registers. + */ + +/* XLTQ - XM Lookup Table Query Register + * ------------------------------------- + */ +#define MLXSW_REG_XLTQ_ID 0x7802 +#define MLXSW_REG_XLTQ_LEN 0x2C + +MLXSW_REG_DEFINE(xltq, MLXSW_REG_XLTQ_ID, MLXSW_REG_XLTQ_LEN); + +enum mlxsw_reg_xltq_xm_device_id { + MLXSW_REG_XLTQ_XM_DEVICE_ID_UNKNOWN, + MLXSW_REG_XLTQ_XM_DEVICE_ID_XLT = 0xCF71, +}; + +/* reg_xltq_xm_device_id + * XM device ID. 
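+ * Expected to read as MLXSW_REG_XLTQ_XM_DEVICE_ID_XLT when an XLT is present.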
+ * Access: RO + */ +MLXSW_ITEM32(reg, xltq, xm_device_id, 0x04, 0, 16); + +/* reg_xltq_xlt_cap_ipv4_lpm + * Access: RO + */ +MLXSW_ITEM32(reg, xltq, xlt_cap_ipv4_lpm, 0x10, 0, 1); + +/* reg_xltq_xlt_cap_ipv6_lpm + * Access: RO + */ +MLXSW_ITEM32(reg, xltq, xlt_cap_ipv6_lpm, 0x10, 1, 1); + +/* reg_xltq_cap_xlt_entries + * Number of XLT entries + * Note: SW must not fill more than 80% in order to avoid overflow + * Access: RO + */ +MLXSW_ITEM32(reg, xltq, cap_xlt_entries, 0x20, 0, 32); + +/* reg_xltq_cap_xlt_mtable + * XLT M-Table max size + * Access: RO + */ +MLXSW_ITEM32(reg, xltq, cap_xlt_mtable, 0x24, 0, 32); + +static inline void mlxsw_reg_xltq_pack(char *payload) +{ + MLXSW_REG_ZERO(xltq, payload); +} + +static inline void mlxsw_reg_xltq_unpack(char *payload, u16 *xm_device_id, bool *xlt_cap_ipv4_lpm, + bool *xlt_cap_ipv6_lpm, u32 *cap_xlt_entries, + u32 *cap_xlt_mtable) +{ + *xm_device_id = mlxsw_reg_xltq_xm_device_id_get(payload); + *xlt_cap_ipv4_lpm = mlxsw_reg_xltq_xlt_cap_ipv4_lpm_get(payload); + *xlt_cap_ipv6_lpm = mlxsw_reg_xltq_xlt_cap_ipv6_lpm_get(payload); + *cap_xlt_entries = mlxsw_reg_xltq_cap_xlt_entries_get(payload); + *cap_xlt_mtable = mlxsw_reg_xltq_cap_xlt_mtable_get(payload); +} + +/* XMDR - XM Direct Register + * ------------------------- + * The XMDR allows direct access to the XM device via the switch. + * Working in synchronous mode. FW waits for response from the XLT + * for each command. FW acks the XMDR accordingly. + */ +#define MLXSW_REG_XMDR_ID 0x7803 +#define MLXSW_REG_XMDR_BASE_LEN 0x20 +#define MLXSW_REG_XMDR_TRANS_LEN 0x80 +#define MLXSW_REG_XMDR_LEN (MLXSW_REG_XMDR_BASE_LEN + \ + MLXSW_REG_XMDR_TRANS_LEN) + +MLXSW_REG_DEFINE(xmdr, MLXSW_REG_XMDR_ID, MLXSW_REG_XMDR_LEN); + +/* reg_xmdr_bulk_entry + * Bulk_entry + * 0: Last entry - immediate flush of XRT-cache + * 1: Bulk entry - do not flush the XRT-cache + * Access: OP + */ +MLXSW_ITEM32(reg, xmdr, bulk_entry, 0x04, 8, 1); + +/* reg_xmdr_num_rec + * Number of records for Direct access to XM + * Supported: 0..4 commands (except NOP which is a filler) + * 0 commands is reserved when bulk_entry = 1. + * 0 commands is allowed when bulk_entry = 0 for immediate XRT-cache flush. + * Access: OP + */ +MLXSW_ITEM32(reg, xmdr, num_rec, 0x04, 0, 4); + +/* reg_xmdr_reply_vect + * Reply Vector + * Bit i for command index i+1 + * values per bit: + * 0: failed + * 1: succeeded + * e.g. if commands 1, 2, 4 succeeded and command 3 failed then binary + * value will be 0b1011 + * Access: RO + */ +MLXSW_ITEM_BIT_ARRAY(reg, xmdr, reply_vect, 0x08, 4, 1); + +static inline void mlxsw_reg_xmdr_pack(char *payload, bool bulk_entry) +{ + MLXSW_REG_ZERO(xmdr, payload); + mlxsw_reg_xmdr_bulk_entry_set(payload, bulk_entry); +} + +enum mlxsw_reg_xmdr_c_cmd_id { + MLXSW_REG_XMDR_C_CMD_ID_LT_ROUTE_V4 = 0x30, + MLXSW_REG_XMDR_C_CMD_ID_LT_ROUTE_V6 = 0x31, +}; + +#define MLXSW_REG_XMDR_C_LT_ROUTE_V4_LEN 32 +#define MLXSW_REG_XMDR_C_LT_ROUTE_V6_LEN 48 + +/* reg_xmdr_c_cmd_id + */ +MLXSW_ITEM32(reg, xmdr_c, cmd_id, 0x00, 24, 8); + +/* reg_xmdr_c_seq_number + */ +MLXSW_ITEM32(reg, xmdr_c, seq_number, 0x00, 12, 12); + +enum mlxsw_reg_xmdr_c_ltr_op { + /* Activity is set */ + MLXSW_REG_XMDR_C_LTR_OP_WRITE = 0, + /* There is no update mask. All fields are updated. */ + MLXSW_REG_XMDR_C_LTR_OP_UPDATE = 1, + MLXSW_REG_XMDR_C_LTR_OP_DELETE = 2, +}; + +/* reg_xmdr_c_ltr_op + * Operation. + */ +MLXSW_ITEM32(reg, xmdr_c, ltr_op, 0x04, 24, 8); + +/* reg_xmdr_c_ltr_trap_action + * Trap action. + * Values are defined in enum mlxsw_reg_ralue_trap_action. 
 */ +MLXSW_ITEM32(reg, xmdr_c, ltr_trap_action, 0x04, 20, 4); + +enum mlxsw_reg_xmdr_c_ltr_trap_id_num { + MLXSW_REG_XMDR_C_LTR_TRAP_ID_NUM_RTR_INGRESS0, + MLXSW_REG_XMDR_C_LTR_TRAP_ID_NUM_RTR_INGRESS1, + MLXSW_REG_XMDR_C_LTR_TRAP_ID_NUM_RTR_INGRESS2, + MLXSW_REG_XMDR_C_LTR_TRAP_ID_NUM_RTR_INGRESS3, +}; + +/* reg_xmdr_c_ltr_trap_id_num + * Trap-ID number. + */ +MLXSW_ITEM32(reg, xmdr_c, ltr_trap_id_num, 0x04, 16, 4); + +/* reg_xmdr_c_ltr_virtual_router + * Virtual Router ID. + * Range is 0..cap_max_virtual_routers-1 + */ +MLXSW_ITEM32(reg, xmdr_c, ltr_virtual_router, 0x04, 0, 16); + +/* reg_xmdr_c_ltr_prefix_len + * Number of bits in the prefix of the LPM route. + */ +MLXSW_ITEM32(reg, xmdr_c, ltr_prefix_len, 0x08, 24, 8); + +/* reg_xmdr_c_ltr_bmp_len + * The best match prefix length in the case that there is no match for + * longer prefixes. + * If (entry_type != MARKER_ENTRY), bmp_len must be equal to prefix_len + */ +MLXSW_ITEM32(reg, xmdr_c, ltr_bmp_len, 0x08, 16, 8); + +/* reg_xmdr_c_ltr_entry_type + * Entry type. + * Values are defined in enum mlxsw_reg_ralue_entry_type. + */ +MLXSW_ITEM32(reg, xmdr_c, ltr_entry_type, 0x08, 4, 4); + +enum mlxsw_reg_xmdr_c_ltr_action_type { + MLXSW_REG_XMDR_C_LTR_ACTION_TYPE_LOCAL, + MLXSW_REG_XMDR_C_LTR_ACTION_TYPE_REMOTE, + MLXSW_REG_XMDR_C_LTR_ACTION_TYPE_IP2ME, +}; + +/* reg_xmdr_c_ltr_action_type + * Action Type. + */ +MLXSW_ITEM32(reg, xmdr_c, ltr_action_type, 0x08, 0, 4); + +/* reg_xmdr_c_ltr_erif + * Egress Router Interface. + * Only relevant in case of LOCAL action. + */ +MLXSW_ITEM32(reg, xmdr_c, ltr_erif, 0x10, 0, 16); + +/* reg_xmdr_c_ltr_adjacency_index + * Points to the first entry of the group-based ECMP. + * Only relevant in case of REMOTE action. + */ +MLXSW_ITEM32(reg, xmdr_c, ltr_adjacency_index, 0x10, 0, 24); + +#define MLXSW_REG_XMDR_C_LTR_POINTER_TO_TUNNEL_DISABLED_MAGIC 0xFFFFFF + +/* reg_xmdr_c_ltr_pointer_to_tunnel + * Only relevant in case of IP2ME action. + */ +MLXSW_ITEM32(reg, xmdr_c, ltr_pointer_to_tunnel, 0x10, 0, 24); + +/* reg_xmdr_c_ltr_ecmp_size + * Number of sequential entries starting + * from the adjacency_index (the number of ECMPs). + * The valid range is 1-64, 512, 1024, 2048 and 4096. + * Only relevant in case of REMOTE action. + */ +MLXSW_ITEM32(reg, xmdr_c, ltr_ecmp_size, 0x14, 0, 32); + +/* reg_xmdr_c_ltr_dip* + * The prefix of the route or of the marker that the object of the LPM + * is compared with. The most significant bits of the dip are the prefix. + * The least significant bits must be '0' if the prefix_len is smaller + * than 128 for IPv6 or smaller than 32 for IPv4.
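+ * For example, a 10.0.0.0/8 route is encoded with ltr_prefix_len 8 and + * ltr_dip4 0x0a000000, the 24 least significant bits left as zero.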
+ */ +MLXSW_ITEM32(reg, xmdr_c, ltr_dip4, 0x1C, 0, 32); +MLXSW_ITEM_BUF(reg, xmdr_c, ltr_dip6, 0x1C, 16); + +static inline void +mlxsw_reg_xmdr_c_ltr_pack(char *xmdr_payload, unsigned int trans_offset, + enum mlxsw_reg_xmdr_c_cmd_id cmd_id, u16 seq_number, + enum mlxsw_reg_xmdr_c_ltr_op op, u16 virtual_router, + u8 prefix_len) +{ + char *payload = xmdr_payload + MLXSW_REG_XMDR_BASE_LEN + trans_offset; + u8 num_rec = mlxsw_reg_xmdr_num_rec_get(xmdr_payload); + + mlxsw_reg_xmdr_num_rec_set(xmdr_payload, num_rec + 1); + + mlxsw_reg_xmdr_c_cmd_id_set(payload, cmd_id); + mlxsw_reg_xmdr_c_seq_number_set(payload, seq_number); + mlxsw_reg_xmdr_c_ltr_op_set(payload, op); + mlxsw_reg_xmdr_c_ltr_virtual_router_set(payload, virtual_router); + mlxsw_reg_xmdr_c_ltr_prefix_len_set(payload, prefix_len); + mlxsw_reg_xmdr_c_ltr_entry_type_set(payload, + MLXSW_REG_RALUE_ENTRY_TYPE_ROUTE_ENTRY); + mlxsw_reg_xmdr_c_ltr_bmp_len_set(payload, prefix_len); +} + +static inline unsigned int +mlxsw_reg_xmdr_c_ltr_pack4(char *xmdr_payload, unsigned int trans_offset, + u16 seq_number, enum mlxsw_reg_xmdr_c_ltr_op op, + u16 virtual_router, u8 prefix_len, u32 *dip) +{ + char *payload = xmdr_payload + MLXSW_REG_XMDR_BASE_LEN + trans_offset; + + mlxsw_reg_xmdr_c_ltr_pack(xmdr_payload, trans_offset, + MLXSW_REG_XMDR_C_CMD_ID_LT_ROUTE_V4, + seq_number, op, virtual_router, prefix_len); + if (dip) + mlxsw_reg_xmdr_c_ltr_dip4_set(payload, *dip); + return MLXSW_REG_XMDR_C_LT_ROUTE_V4_LEN; +} + +static inline unsigned int +mlxsw_reg_xmdr_c_ltr_pack6(char *xmdr_payload, unsigned int trans_offset, + u16 seq_number, enum mlxsw_reg_xmdr_c_ltr_op op, + u16 virtual_router, u8 prefix_len, const void *dip) +{ + char *payload = xmdr_payload + MLXSW_REG_XMDR_BASE_LEN + trans_offset; + + mlxsw_reg_xmdr_c_ltr_pack(xmdr_payload, trans_offset, + MLXSW_REG_XMDR_C_CMD_ID_LT_ROUTE_V6, + seq_number, op, virtual_router, prefix_len); + if (dip) + mlxsw_reg_xmdr_c_ltr_dip6_memcpy_to(payload, dip); + return MLXSW_REG_XMDR_C_LT_ROUTE_V6_LEN; +} + +static inline void +mlxsw_reg_xmdr_c_ltr_act_remote_pack(char *xmdr_payload, unsigned int trans_offset, + enum mlxsw_reg_ralue_trap_action trap_action, + enum mlxsw_reg_xmdr_c_ltr_trap_id_num trap_id_num, + u32 adjacency_index, u16 ecmp_size) +{ + char *payload = xmdr_payload + MLXSW_REG_XMDR_BASE_LEN + trans_offset; + + mlxsw_reg_xmdr_c_ltr_action_type_set(payload, MLXSW_REG_XMDR_C_LTR_ACTION_TYPE_REMOTE); + mlxsw_reg_xmdr_c_ltr_trap_action_set(payload, trap_action); + mlxsw_reg_xmdr_c_ltr_trap_id_num_set(payload, trap_id_num); + mlxsw_reg_xmdr_c_ltr_adjacency_index_set(payload, adjacency_index); + mlxsw_reg_xmdr_c_ltr_ecmp_size_set(payload, ecmp_size); +} + +static inline void +mlxsw_reg_xmdr_c_ltr_act_local_pack(char *xmdr_payload, unsigned int trans_offset, + enum mlxsw_reg_ralue_trap_action trap_action, + enum mlxsw_reg_xmdr_c_ltr_trap_id_num trap_id_num, u16 erif) +{ + char *payload = xmdr_payload + MLXSW_REG_XMDR_BASE_LEN + trans_offset; + + mlxsw_reg_xmdr_c_ltr_action_type_set(payload, MLXSW_REG_XMDR_C_LTR_ACTION_TYPE_LOCAL); + mlxsw_reg_xmdr_c_ltr_trap_action_set(payload, trap_action); + mlxsw_reg_xmdr_c_ltr_trap_id_num_set(payload, trap_id_num); + mlxsw_reg_xmdr_c_ltr_erif_set(payload, erif); +} + +static inline void mlxsw_reg_xmdr_c_ltr_act_ip2me_pack(char *xmdr_payload, + unsigned int trans_offset) +{ + char *payload = xmdr_payload + MLXSW_REG_XMDR_BASE_LEN + trans_offset; + + mlxsw_reg_xmdr_c_ltr_action_type_set(payload, MLXSW_REG_XMDR_C_LTR_ACTION_TYPE_IP2ME); + 
mlxsw_reg_xmdr_c_ltr_pointer_to_tunnel_set(payload, + MLXSW_REG_XMDR_C_LTR_POINTER_TO_TUNNEL_DISABLED_MAGIC); +} + +static inline void mlxsw_reg_xmdr_c_ltr_act_ip2me_tun_pack(char *xmdr_payload, + unsigned int trans_offset, + u32 pointer_to_tunnel) +{ + char *payload = xmdr_payload + MLXSW_REG_XMDR_BASE_LEN + trans_offset; + + mlxsw_reg_xmdr_c_ltr_action_type_set(payload, MLXSW_REG_XMDR_C_LTR_ACTION_TYPE_IP2ME); + mlxsw_reg_xmdr_c_ltr_pointer_to_tunnel_set(payload, pointer_to_tunnel); +} + +/* XRMT - XM Router M Table Register + * --------------------------------- + * The XRMT configures the M-Table for the XLT-LPM. + */ +#define MLXSW_REG_XRMT_ID 0x7810 +#define MLXSW_REG_XRMT_LEN 0x14 + +MLXSW_REG_DEFINE(xrmt, MLXSW_REG_XRMT_ID, MLXSW_REG_XRMT_LEN); + +/* reg_xrmt_index + * Index in M-Table. + * Range 0..cap_xlt_mtable-1 + * Access: Index + */ +MLXSW_ITEM32(reg, xrmt, index, 0x04, 0, 20); + +/* reg_xrmt_l0_val + * Access: RW + */ +MLXSW_ITEM32(reg, xrmt, l0_val, 0x10, 24, 8); + +static inline void mlxsw_reg_xrmt_pack(char *payload, u32 index, u8 l0_val) +{ + MLXSW_REG_ZERO(xrmt, payload); + mlxsw_reg_xrmt_index_set(payload, index); + mlxsw_reg_xrmt_l0_val_set(payload, l0_val); +} /* XRALTA - XM Router Algorithmic LPM Tree Allocation Register * ----------------------------------------------------------- @@ -11487,6 +12059,13 @@ static const struct mlxsw_reg_info *mlxsw_reg_infos[] = { MLXSW_REG(rigr2), MLXSW_REG(recr2), MLXSW_REG(rmft2), + MLXSW_REG(rxlte), + MLXSW_REG(rxltm), + MLXSW_REG(rlcmld), + MLXSW_REG(rlpmce), + MLXSW_REG(xltq), + MLXSW_REG(xmdr), + MLXSW_REG(xrmt), MLXSW_REG(xralta), MLXSW_REG(xralst), MLXSW_REG(xraltb), diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index df8175cd44ab..1650d9852b5b 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -1840,6 +1840,9 @@ static int mlxsw_sp_port_module_info_init(struct mlxsw_sp *mlxsw_sp) return -ENOMEM; for (i = 1; i < max_ports; i++) { + if (mlxsw_core_port_is_xm(mlxsw_sp->core, i)) + continue; + err = mlxsw_sp_port_module_info_get(mlxsw_sp, i, &port_mapping); if (err) goto err_port_module_info_get; @@ -2933,6 +2936,8 @@ static const struct mlxsw_config_profile mlxsw_sp2_config_profile = { .max_ib_mc = 0, .used_max_pkey = 1, .max_pkey = 0, + .used_kvh_xlt_cache_mode = 1, + .kvh_xlt_cache_mode = 1, .swid_config = { { .used_type = 1, diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index d671d961fc33..41424ee909a0 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -477,6 +477,12 @@ struct mlxsw_sp_vr { refcount_t ul_rif_refcnt; }; +static int mlxsw_sp_router_ll_basic_init(struct mlxsw_sp *mlxsw_sp, u16 vr_id, + enum mlxsw_sp_l3proto proto) +{ + return 0; +} + static int mlxsw_sp_router_ll_basic_ralta_write(struct mlxsw_sp *mlxsw_sp, char *xralta_pl) { return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ralta), @@ -506,6 +512,10 @@ static struct mlxsw_sp_fib *mlxsw_sp_fib_create(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_fib *fib; int err; + err = ll_ops->init(mlxsw_sp, vr->id, proto); + if (err) + return ERR_PTR(err); + lpm_tree = mlxsw_sp->router->lpm.proto_trees[proto]; fib = kzalloc(sizeof(*fib), GFP_KERNEL); if (!fib) @@ -7088,6 +7098,7 @@ static void mlxsw_sp_router_fib_event_work(struct work_struct *work) op_ctx->bulk_ok = !list_is_last(&fib_event->list, 
&fib_event_queue) && fib_event->family == next_fib_event->family && fib_event->event == next_fib_event->event; + op_ctx->event = fib_event->event; /* In case family of this and the previous entry are different, context * reinitialization is going to be needed now, indicate that. @@ -9122,6 +9133,7 @@ static void __mlxsw_sp_router_fini(struct mlxsw_sp *mlxsw_sp) } static const struct mlxsw_sp_router_ll_ops mlxsw_sp_router_ll_basic_ops = { + .init = mlxsw_sp_router_ll_basic_init, .ralta_write = mlxsw_sp_router_ll_basic_ralta_write, .ralst_write = mlxsw_sp_router_ll_basic_ralst_write, .raltb_write = mlxsw_sp_router_ll_basic_raltb_write, @@ -9197,7 +9209,13 @@ int mlxsw_sp_router_init(struct mlxsw_sp *mlxsw_sp, mlxsw_sp->router = router; router->mlxsw_sp = mlxsw_sp; - router->proto_ll_ops[MLXSW_SP_L3_PROTO_IPV4] = &mlxsw_sp_router_ll_basic_ops; + err = mlxsw_sp_router_xm_init(mlxsw_sp); + if (err) + goto err_xm_init; + + router->proto_ll_ops[MLXSW_SP_L3_PROTO_IPV4] = mlxsw_sp_router_xm_ipv4_is_supported(mlxsw_sp) ? + &mlxsw_sp_router_ll_xm_ops : + &mlxsw_sp_router_ll_basic_ops; router->proto_ll_ops[MLXSW_SP_L3_PROTO_IPV6] = &mlxsw_sp_router_ll_basic_ops; err = mlxsw_sp_router_ll_op_ctx_init(router); @@ -9329,6 +9347,8 @@ err_rifs_init: err_router_init: mlxsw_sp_router_ll_op_ctx_fini(router); err_ll_op_ctx_init: + mlxsw_sp_router_xm_fini(mlxsw_sp); +err_xm_init: mutex_destroy(&mlxsw_sp->router->lock); kfree(mlxsw_sp->router); return err; @@ -9356,6 +9376,7 @@ void mlxsw_sp_router_fini(struct mlxsw_sp *mlxsw_sp) mlxsw_sp_rifs_fini(mlxsw_sp); __mlxsw_sp_router_fini(mlxsw_sp); mlxsw_sp_router_ll_op_ctx_fini(mlxsw_sp->router); + mlxsw_sp_router_xm_fini(mlxsw_sp); mutex_destroy(&mlxsw_sp->router->lock); kfree(mlxsw_sp->router); } diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.h index d8aed866af21..2875ee8ec537 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.h +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.h @@ -24,6 +24,7 @@ struct mlxsw_sp_fib_entry_op_ctx { * the context priv is initialized. */ struct list_head fib_entry_priv_list; + unsigned long event; unsigned long ll_priv[]; }; @@ -76,6 +77,7 @@ struct mlxsw_sp_router { const struct mlxsw_sp_router_ll_ops *proto_ll_ops[MLXSW_SP_L3_PROTO_MAX]; struct mlxsw_sp_fib_entry_op_ctx *ll_op_ctx; u16 lb_rif_index; + struct mlxsw_sp_router_xm *xm; }; struct mlxsw_sp_fib_entry_priv { @@ -94,6 +96,8 @@ enum mlxsw_sp_fib_entry_op { * register sets to work with ordinary and XM trees and FIB entries. 
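 * The XM flavor of these ops programs the XRALTA/XRALST/XRALTB and XMDR * registers instead of their RAL* counterparts.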
*/ struct mlxsw_sp_router_ll_ops { + int (*init)(struct mlxsw_sp *mlxsw_sp, u16 vr_id, + enum mlxsw_sp_l3proto proto); int (*ralta_write)(struct mlxsw_sp *mlxsw_sp, char *xralta_pl); int (*ralst_write)(struct mlxsw_sp *mlxsw_sp, char *xralst_pl); int (*raltb_write)(struct mlxsw_sp *mlxsw_sp, char *xraltb_pl); @@ -219,4 +223,10 @@ static inline bool mlxsw_sp_l3addr_eq(const union mlxsw_sp_l3addr *addr1, int mlxsw_sp_ipip_ecn_encap_init(struct mlxsw_sp *mlxsw_sp); int mlxsw_sp_ipip_ecn_decap_init(struct mlxsw_sp *mlxsw_sp); +extern const struct mlxsw_sp_router_ll_ops mlxsw_sp_router_ll_xm_ops; + +int mlxsw_sp_router_xm_init(struct mlxsw_sp *mlxsw_sp); +void mlxsw_sp_router_xm_fini(struct mlxsw_sp *mlxsw_sp); +bool mlxsw_sp_router_xm_ipv4_is_supported(const struct mlxsw_sp *mlxsw_sp); + #endif /* _MLXSW_ROUTER_H_*/ diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router_xm.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router_xm.c new file mode 100644 index 000000000000..d213af723a2a --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router_xm.c @@ -0,0 +1,812 @@ +// SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 +/* Copyright (c) 2020 Mellanox Technologies. All rights reserved */ + +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/rhashtable.h> + +#include "spectrum.h" +#include "core.h" +#include "reg.h" +#include "spectrum_router.h" + +#define MLXSW_SP_ROUTER_XM_M_VAL 16 + +static const u8 mlxsw_sp_router_xm_m_val[] = { + [MLXSW_SP_L3_PROTO_IPV4] = MLXSW_SP_ROUTER_XM_M_VAL, + [MLXSW_SP_L3_PROTO_IPV6] = 0, /* Currently unused. */ +}; + +#define MLXSW_SP_ROUTER_XM_L_VAL_MAX 16 + +struct mlxsw_sp_router_xm { + bool ipv4_supported; + bool ipv6_supported; + unsigned int entries_size; + struct rhashtable ltable_ht; + struct rhashtable flush_ht; /* Stores items about to be flushed from cache */ + unsigned int flush_count; + bool flush_all_mode; +}; + +struct mlxsw_sp_router_xm_ltable_node { + struct rhash_head ht_node; /* Member of router_xm->ltable_ht */ + u16 mindex; + u8 current_lvalue; + refcount_t refcnt; + unsigned int lvalue_ref[MLXSW_SP_ROUTER_XM_L_VAL_MAX + 1]; +}; + +static const struct rhashtable_params mlxsw_sp_router_xm_ltable_ht_params = { + .key_offset = offsetof(struct mlxsw_sp_router_xm_ltable_node, mindex), + .head_offset = offsetof(struct mlxsw_sp_router_xm_ltable_node, ht_node), + .key_len = sizeof(u16), + .automatic_shrinking = true, +}; + +struct mlxsw_sp_router_xm_flush_info { + bool all; + enum mlxsw_sp_l3proto proto; + u16 virtual_router; + u8 prefix_len; + unsigned char addr[sizeof(struct in6_addr)]; +}; + +struct mlxsw_sp_router_xm_fib_entry { + bool committed; + struct mlxsw_sp_router_xm_ltable_node *ltable_node; /* Parent node */ + u16 mindex; /* Store for processing from commit op */ + u8 lvalue; + struct mlxsw_sp_router_xm_flush_info flush_info; +}; + +#define MLXSW_SP_ROUTE_LL_XM_ENTRIES_MAX \ + (MLXSW_REG_XMDR_TRANS_LEN / MLXSW_REG_XMDR_C_LT_ROUTE_V4_LEN) + +struct mlxsw_sp_fib_entry_op_ctx_xm { + bool initialized; + char xmdr_pl[MLXSW_REG_XMDR_LEN]; + unsigned int trans_offset; /* Offset of the current command within one + * transaction of XMDR register. + */ + unsigned int trans_item_len; /* The current command length. This is used + * to advance 'trans_offset' when the next + * command is appended. 
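+ * For example, an IPv4 route command takes + * MLXSW_REG_XMDR_C_LT_ROUTE_V4_LEN (32) bytes, so up + * to four such commands fit into one 0x80-byte + * transaction.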
+ */ + unsigned int entries_count; + struct mlxsw_sp_router_xm_fib_entry *entries[MLXSW_SP_ROUTE_LL_XM_ENTRIES_MAX]; +}; + +static int mlxsw_sp_router_ll_xm_init(struct mlxsw_sp *mlxsw_sp, u16 vr_id, + enum mlxsw_sp_l3proto proto) +{ + char rxlte_pl[MLXSW_REG_RXLTE_LEN]; + + mlxsw_reg_rxlte_pack(rxlte_pl, vr_id, + (enum mlxsw_reg_rxlte_protocol) proto, true); + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(rxlte), rxlte_pl); +} + +static int mlxsw_sp_router_ll_xm_ralta_write(struct mlxsw_sp *mlxsw_sp, char *xralta_pl) +{ + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(xralta), xralta_pl); +} + +static int mlxsw_sp_router_ll_xm_ralst_write(struct mlxsw_sp *mlxsw_sp, char *xralst_pl) +{ + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(xralst), xralst_pl); +} + +static int mlxsw_sp_router_ll_xm_raltb_write(struct mlxsw_sp *mlxsw_sp, char *xraltb_pl) +{ + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(xraltb), xraltb_pl); +} + +static u16 mlxsw_sp_router_ll_xm_mindex_get4(const u32 addr) +{ + /* Currently the M-index is set to linear mode. That means it is defined + * as 16 MSB of IP address. + */ + return addr >> MLXSW_SP_ROUTER_XM_L_VAL_MAX; +} + +static u16 mlxsw_sp_router_ll_xm_mindex_get6(const unsigned char *addr) +{ + WARN_ON_ONCE(1); + return 0; /* currently unused */ +} + +static void mlxsw_sp_router_ll_xm_op_ctx_check_init(struct mlxsw_sp_fib_entry_op_ctx *op_ctx, + struct mlxsw_sp_fib_entry_op_ctx_xm *op_ctx_xm) +{ + if (op_ctx->initialized) + return; + op_ctx->initialized = true; + + mlxsw_reg_xmdr_pack(op_ctx_xm->xmdr_pl, true); + op_ctx_xm->trans_offset = 0; + op_ctx_xm->entries_count = 0; +} + +static void mlxsw_sp_router_ll_xm_fib_entry_pack(struct mlxsw_sp_fib_entry_op_ctx *op_ctx, + enum mlxsw_sp_l3proto proto, + enum mlxsw_sp_fib_entry_op op, + u16 virtual_router, u8 prefix_len, + unsigned char *addr, + struct mlxsw_sp_fib_entry_priv *priv) +{ + struct mlxsw_sp_fib_entry_op_ctx_xm *op_ctx_xm = (void *) op_ctx->ll_priv; + struct mlxsw_sp_router_xm_fib_entry *fib_entry = (void *) priv->priv; + struct mlxsw_sp_router_xm_flush_info *flush_info; + enum mlxsw_reg_xmdr_c_ltr_op xmdr_c_ltr_op; + unsigned int len; + + mlxsw_sp_router_ll_xm_op_ctx_check_init(op_ctx, op_ctx_xm); + + switch (op) { + case MLXSW_SP_FIB_ENTRY_OP_WRITE: + xmdr_c_ltr_op = MLXSW_REG_XMDR_C_LTR_OP_WRITE; + break; + case MLXSW_SP_FIB_ENTRY_OP_UPDATE: + xmdr_c_ltr_op = MLXSW_REG_XMDR_C_LTR_OP_UPDATE; + break; + case MLXSW_SP_FIB_ENTRY_OP_DELETE: + xmdr_c_ltr_op = MLXSW_REG_XMDR_C_LTR_OP_DELETE; + break; + default: + WARN_ON_ONCE(1); + return; + } + + switch (proto) { + case MLXSW_SP_L3_PROTO_IPV4: + len = mlxsw_reg_xmdr_c_ltr_pack4(op_ctx_xm->xmdr_pl, op_ctx_xm->trans_offset, + op_ctx_xm->entries_count, xmdr_c_ltr_op, + virtual_router, prefix_len, (u32 *) addr); + fib_entry->mindex = mlxsw_sp_router_ll_xm_mindex_get4(*((u32 *) addr)); + break; + case MLXSW_SP_L3_PROTO_IPV6: + len = mlxsw_reg_xmdr_c_ltr_pack6(op_ctx_xm->xmdr_pl, op_ctx_xm->trans_offset, + op_ctx_xm->entries_count, xmdr_c_ltr_op, + virtual_router, prefix_len, addr); + fib_entry->mindex = mlxsw_sp_router_ll_xm_mindex_get6(addr); + break; + default: + WARN_ON_ONCE(1); + return; + } + if (!op_ctx_xm->trans_offset) + op_ctx_xm->trans_item_len = len; + else + WARN_ON_ONCE(op_ctx_xm->trans_item_len != len); + + op_ctx_xm->entries[op_ctx_xm->entries_count] = fib_entry; + + fib_entry->lvalue = prefix_len > mlxsw_sp_router_xm_m_val[proto] ? 
+ prefix_len - mlxsw_sp_router_xm_m_val[proto] : 0; + + flush_info = &fib_entry->flush_info; + flush_info->proto = proto; + flush_info->virtual_router = virtual_router; + flush_info->prefix_len = prefix_len; + if (addr) + memcpy(flush_info->addr, addr, sizeof(flush_info->addr)); + else + memset(flush_info->addr, 0, sizeof(flush_info->addr)); +} + +static void +mlxsw_sp_router_ll_xm_fib_entry_act_remote_pack(struct mlxsw_sp_fib_entry_op_ctx *op_ctx, + enum mlxsw_reg_ralue_trap_action trap_action, + u16 trap_id, u32 adjacency_index, u16 ecmp_size) +{ + struct mlxsw_sp_fib_entry_op_ctx_xm *op_ctx_xm = (void *) op_ctx->ll_priv; + + mlxsw_reg_xmdr_c_ltr_act_remote_pack(op_ctx_xm->xmdr_pl, op_ctx_xm->trans_offset, + trap_action, trap_id, adjacency_index, ecmp_size); +} + +static void +mlxsw_sp_router_ll_xm_fib_entry_act_local_pack(struct mlxsw_sp_fib_entry_op_ctx *op_ctx, + enum mlxsw_reg_ralue_trap_action trap_action, + u16 trap_id, u16 local_erif) +{ + struct mlxsw_sp_fib_entry_op_ctx_xm *op_ctx_xm = (void *) op_ctx->ll_priv; + + mlxsw_reg_xmdr_c_ltr_act_local_pack(op_ctx_xm->xmdr_pl, op_ctx_xm->trans_offset, + trap_action, trap_id, local_erif); +} + +static void +mlxsw_sp_router_ll_xm_fib_entry_act_ip2me_pack(struct mlxsw_sp_fib_entry_op_ctx *op_ctx) +{ + struct mlxsw_sp_fib_entry_op_ctx_xm *op_ctx_xm = (void *) op_ctx->ll_priv; + + mlxsw_reg_xmdr_c_ltr_act_ip2me_pack(op_ctx_xm->xmdr_pl, op_ctx_xm->trans_offset); +} + +static void +mlxsw_sp_router_ll_xm_fib_entry_act_ip2me_tun_pack(struct mlxsw_sp_fib_entry_op_ctx *op_ctx, + u32 tunnel_ptr) +{ + struct mlxsw_sp_fib_entry_op_ctx_xm *op_ctx_xm = (void *) op_ctx->ll_priv; + + mlxsw_reg_xmdr_c_ltr_act_ip2me_tun_pack(op_ctx_xm->xmdr_pl, op_ctx_xm->trans_offset, + tunnel_ptr); +} + +static struct mlxsw_sp_router_xm_ltable_node * +mlxsw_sp_router_xm_ltable_node_get(struct mlxsw_sp_router_xm *router_xm, u16 mindex) +{ + struct mlxsw_sp_router_xm_ltable_node *ltable_node; + int err; + + ltable_node = rhashtable_lookup_fast(&router_xm->ltable_ht, &mindex, + mlxsw_sp_router_xm_ltable_ht_params); + if (ltable_node) { + refcount_inc(<able_node->refcnt); + return ltable_node; + } + ltable_node = kzalloc(sizeof(*ltable_node), GFP_KERNEL); + if (!ltable_node) + return ERR_PTR(-ENOMEM); + ltable_node->mindex = mindex; + refcount_set(<able_node->refcnt, 1); + + err = rhashtable_insert_fast(&router_xm->ltable_ht, <able_node->ht_node, + mlxsw_sp_router_xm_ltable_ht_params); + if (err) + goto err_insert; + + return ltable_node; + +err_insert: + kfree(ltable_node); + return ERR_PTR(err); +} + +static void mlxsw_sp_router_xm_ltable_node_put(struct mlxsw_sp_router_xm *router_xm, + struct mlxsw_sp_router_xm_ltable_node *ltable_node) +{ + if (!refcount_dec_and_test(<able_node->refcnt)) + return; + rhashtable_remove_fast(&router_xm->ltable_ht, <able_node->ht_node, + mlxsw_sp_router_xm_ltable_ht_params); + kfree(ltable_node); +} + +static int mlxsw_sp_router_xm_ltable_lvalue_set(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_router_xm_ltable_node *ltable_node) +{ + char xrmt_pl[MLXSW_REG_XRMT_LEN]; + + mlxsw_reg_xrmt_pack(xrmt_pl, ltable_node->mindex, ltable_node->current_lvalue); + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(xrmt), xrmt_pl); +} + +struct mlxsw_sp_router_xm_flush_node { + struct rhash_head ht_node; /* Member of router_xm->flush_ht */ + struct list_head list; + struct mlxsw_sp_router_xm_flush_info flush_info; + struct delayed_work dw; + struct mlxsw_sp *mlxsw_sp; + unsigned long start_jiffies; + unsigned int reuses; /* By how many flush calls this was 
reused. */ + refcount_t refcnt; +}; + +static const struct rhashtable_params mlxsw_sp_router_xm_flush_ht_params = { + .key_offset = offsetof(struct mlxsw_sp_router_xm_flush_node, flush_info), + .head_offset = offsetof(struct mlxsw_sp_router_xm_flush_node, ht_node), + .key_len = sizeof(struct mlxsw_sp_router_xm_flush_info), + .automatic_shrinking = true, +}; + +static struct mlxsw_sp_router_xm_flush_node * +mlxsw_sp_router_xm_cache_flush_node_create(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_router_xm_flush_info *flush_info) +{ + struct mlxsw_sp_router_xm *router_xm = mlxsw_sp->router->xm; + struct mlxsw_sp_router_xm_flush_node *flush_node; + int err; + + flush_node = kzalloc(sizeof(*flush_node), GFP_KERNEL); + if (!flush_node) + return ERR_PTR(-ENOMEM); + + flush_node->flush_info = *flush_info; + err = rhashtable_insert_fast(&router_xm->flush_ht, &flush_node->ht_node, + mlxsw_sp_router_xm_flush_ht_params); + if (err) { + kfree(flush_node); + return ERR_PTR(err); + } + router_xm->flush_count++; + flush_node->mlxsw_sp = mlxsw_sp; + flush_node->start_jiffies = jiffies; + refcount_set(&flush_node->refcnt, 1); + return flush_node; +} + +static void +mlxsw_sp_router_xm_cache_flush_node_hold(struct mlxsw_sp_router_xm_flush_node *flush_node) +{ + if (!flush_node) + return; + refcount_inc(&flush_node->refcnt); +} + +static void +mlxsw_sp_router_xm_cache_flush_node_put(struct mlxsw_sp_router_xm_flush_node *flush_node) +{ + if (!flush_node || !refcount_dec_and_test(&flush_node->refcnt)) + return; + kfree(flush_node); +} + +static void +mlxsw_sp_router_xm_cache_flush_node_destroy(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_router_xm_flush_node *flush_node) +{ + struct mlxsw_sp_router_xm *router_xm = mlxsw_sp->router->xm; + + router_xm->flush_count--; + rhashtable_remove_fast(&router_xm->flush_ht, &flush_node->ht_node, + mlxsw_sp_router_xm_flush_ht_params); + mlxsw_sp_router_xm_cache_flush_node_put(flush_node); +} + +static u32 mlxsw_sp_router_xm_flush_mask4(u8 prefix_len) +{ + return GENMASK(31, 32 - prefix_len); +} + +static unsigned char *mlxsw_sp_router_xm_flush_mask6(u8 prefix_len) +{ + static unsigned char mask[sizeof(struct in6_addr)]; + + memset(mask, 0, sizeof(mask)); + memset(mask, 0xff, prefix_len / 8); + mask[prefix_len / 8] = GENMASK(8, 8 - prefix_len % 8); + return mask; +} + +#define MLXSW_SP_ROUTER_XM_CACHE_PARALLEL_FLUSHES_LIMIT 15 +#define MLXSW_SP_ROUTER_XM_CACHE_FLUSH_ALL_MIN_REUSES 15 +#define MLXSW_SP_ROUTER_XM_CACHE_DELAY 50 /* usecs */ +#define MLXSW_SP_ROUTER_XM_CACHE_MAX_WAIT (MLXSW_SP_ROUTER_XM_CACHE_DELAY * 10) + +static void mlxsw_sp_router_xm_cache_flush_work(struct work_struct *work) +{ + struct mlxsw_sp_router_xm_flush_info *flush_info; + struct mlxsw_sp_router_xm_flush_node *flush_node; + char rlcmld_pl[MLXSW_REG_RLCMLD_LEN]; + enum mlxsw_reg_rlcmld_select select; + struct mlxsw_sp *mlxsw_sp; + u32 addr4; + int err; + + flush_node = container_of(work, struct mlxsw_sp_router_xm_flush_node, + dw.work); + mlxsw_sp = flush_node->mlxsw_sp; + flush_info = &flush_node->flush_info; + + if (flush_info->all) { + char rlpmce_pl[MLXSW_REG_RLPMCE_LEN]; + + mlxsw_reg_rlpmce_pack(rlpmce_pl, true, false); + err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(rlpmce), + rlpmce_pl); + if (err) + dev_err(mlxsw_sp->bus_info->dev, "Failed to flush XM cache\n"); + + if (flush_node->reuses < + MLXSW_SP_ROUTER_XM_CACHE_FLUSH_ALL_MIN_REUSES) + /* Leaving flush-all mode. 
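+ * Fewer than FLUSH_ALL_MIN_REUSES flushes were + * merged into this flush-all, so the consolidation + * no longer pays off and targeted flushing resumes.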
 */ + mlxsw_sp->router->xm->flush_all_mode = false; + goto out; + } + + select = MLXSW_REG_RLCMLD_SELECT_M_AND_ML_ENTRIES; + + switch (flush_info->proto) { + case MLXSW_SP_L3_PROTO_IPV4: + addr4 = *((u32 *) flush_info->addr); + addr4 &= mlxsw_sp_router_xm_flush_mask4(flush_info->prefix_len); + + /* In case the flush prefix length is bigger than M-value, + * it makes no sense to flush M entries. So just flush + * the ML entries. + */ + if (flush_info->prefix_len > MLXSW_SP_ROUTER_XM_M_VAL) + select = MLXSW_REG_RLCMLD_SELECT_ML_ENTRIES; + + mlxsw_reg_rlcmld_pack4(rlcmld_pl, select, + flush_info->virtual_router, addr4, + mlxsw_sp_router_xm_flush_mask4(flush_info->prefix_len)); + break; + case MLXSW_SP_L3_PROTO_IPV6: + mlxsw_reg_rlcmld_pack6(rlcmld_pl, select, + flush_info->virtual_router, flush_info->addr, + mlxsw_sp_router_xm_flush_mask6(flush_info->prefix_len)); + break; + default: + WARN_ON(true); + goto out; + } + err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(rlcmld), rlcmld_pl); + if (err) + dev_err(mlxsw_sp->bus_info->dev, "Failed to flush XM cache\n"); + +out: + mlxsw_sp_router_xm_cache_flush_node_destroy(mlxsw_sp, flush_node); +} + +static bool +mlxsw_sp_router_xm_cache_flush_may_cancel(struct mlxsw_sp_router_xm_flush_node *flush_node) +{ + unsigned long max_wait = usecs_to_jiffies(MLXSW_SP_ROUTER_XM_CACHE_MAX_WAIT); + unsigned long delay = usecs_to_jiffies(MLXSW_SP_ROUTER_XM_CACHE_DELAY); + + /* If the same flushing work is already pending, check whether it can + * be consolidated with this request; that is possible for up to + * MAX_WAIT. Try to cancel the delayed work; if it was still pending, + * the two flushes can be merged. + */ + if (time_is_before_jiffies(flush_node->start_jiffies + max_wait - delay) && + cancel_delayed_work_sync(&flush_node->dw)) + return true; + return false; +} + +static int +mlxsw_sp_router_xm_cache_flush_schedule(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_router_xm_flush_info *flush_info) +{ + unsigned long delay = usecs_to_jiffies(MLXSW_SP_ROUTER_XM_CACHE_DELAY); + struct mlxsw_sp_router_xm_flush_info flush_all_info = {.all = true}; + struct mlxsw_sp_router_xm *router_xm = mlxsw_sp->router->xm; + struct mlxsw_sp_router_xm_flush_node *flush_node; + + /* Check if the queued number of flushes reached a critical amount, + * after which it is better to just flush the whole cache. + */ + if (router_xm->flush_count == MLXSW_SP_ROUTER_XM_CACHE_PARALLEL_FLUSHES_LIMIT) + /* Entering flush-all mode. */ + router_xm->flush_all_mode = true; + + if (router_xm->flush_all_mode) + flush_info = &flush_all_info; + + rcu_read_lock(); + flush_node = rhashtable_lookup_fast(&router_xm->flush_ht, flush_info, + mlxsw_sp_router_xm_flush_ht_params); + /* Take a reference so the object is not freed before a possible + * delayed work cancel is done. + */ + mlxsw_sp_router_xm_cache_flush_node_hold(flush_node); + rcu_read_unlock(); + + if (flush_node && mlxsw_sp_router_xm_cache_flush_may_cancel(flush_node)) { + flush_node->reuses++; + mlxsw_sp_router_xm_cache_flush_node_put(flush_node); + /* The original work was within the wait period and was canceled. + * That means that the reference is still held and the + * flush_node_put() call above did not free the flush_node. + * Reschedule it with a fresh delay.
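+ * Consolidation is bounded by MLXSW_SP_ROUTER_XM_CACHE_MAX_WAIT, so a + * steady stream of requests for the same key cannot defer the flush + * forever.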
 */ + goto schedule_work; + } else { + mlxsw_sp_router_xm_cache_flush_node_put(flush_node); + } + + flush_node = mlxsw_sp_router_xm_cache_flush_node_create(mlxsw_sp, flush_info); + if (IS_ERR(flush_node)) + return PTR_ERR(flush_node); + INIT_DELAYED_WORK(&flush_node->dw, mlxsw_sp_router_xm_cache_flush_work); + +schedule_work: + mlxsw_core_schedule_dw(&flush_node->dw, delay); + return 0; +} + +static int +mlxsw_sp_router_xm_ml_entry_add(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_router_xm_fib_entry *fib_entry) +{ + struct mlxsw_sp_router_xm *router_xm = mlxsw_sp->router->xm; + struct mlxsw_sp_router_xm_ltable_node *ltable_node; + u8 lvalue = fib_entry->lvalue; + int err; + + ltable_node = mlxsw_sp_router_xm_ltable_node_get(router_xm, + fib_entry->mindex); + if (IS_ERR(ltable_node)) + return PTR_ERR(ltable_node); + if (lvalue > ltable_node->current_lvalue) { + /* The L-value is bigger than the one currently set, update. */ + ltable_node->current_lvalue = lvalue; + err = mlxsw_sp_router_xm_ltable_lvalue_set(mlxsw_sp, + ltable_node); + if (err) + goto err_lvalue_set; + + /* The L value for prefix/M is increased. + * Therefore, all entries in M and ML caches matching + * {prefix/M, proto, VR} need to be flushed. Set the flush + * prefix length to M to achieve that. + */ + fib_entry->flush_info.prefix_len = MLXSW_SP_ROUTER_XM_M_VAL; + } + + ltable_node->lvalue_ref[lvalue]++; + fib_entry->ltable_node = ltable_node; + + return 0; + +err_lvalue_set: + mlxsw_sp_router_xm_ltable_node_put(router_xm, ltable_node); + return err; +} + +static void +mlxsw_sp_router_xm_ml_entry_del(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_router_xm_fib_entry *fib_entry) +{ + struct mlxsw_sp_router_xm_ltable_node *ltable_node = + fib_entry->ltable_node; + struct mlxsw_sp_router_xm *router_xm = mlxsw_sp->router->xm; + u8 lvalue = fib_entry->lvalue; + + ltable_node->lvalue_ref[lvalue]--; + if (lvalue == ltable_node->current_lvalue && lvalue && + !ltable_node->lvalue_ref[lvalue]) { + u8 new_lvalue = lvalue - 1; + + /* Find the biggest L-value left out there. */ + while (new_lvalue > 0 && !ltable_node->lvalue_ref[new_lvalue]) + new_lvalue--; + + ltable_node->current_lvalue = new_lvalue; + mlxsw_sp_router_xm_ltable_lvalue_set(mlxsw_sp, ltable_node); + + /* The L value for prefix/M is decreased. + * Therefore, all entries in M and ML caches matching + * {prefix/M, proto, VR} need to be flushed. Set the flush + * prefix length to M to achieve that.
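+ * A single /M flush matches every cached entry sharing those M most + * significant bits, so the affected more specific entries do not need + * to be flushed one by one.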
+ */ + fib_entry->flush_info.prefix_len = MLXSW_SP_ROUTER_XM_M_VAL; + } + mlxsw_sp_router_xm_ltable_node_put(router_xm, ltable_node); +} + +static int +mlxsw_sp_router_xm_ml_entries_add(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_fib_entry_op_ctx_xm *op_ctx_xm) +{ + struct mlxsw_sp_router_xm_fib_entry *fib_entry; + int err; + int i; + + for (i = 0; i < op_ctx_xm->entries_count; i++) { + fib_entry = op_ctx_xm->entries[i]; + err = mlxsw_sp_router_xm_ml_entry_add(mlxsw_sp, fib_entry); + if (err) + goto rollback; + } + return 0; + +rollback: + for (i--; i >= 0; i--) { + fib_entry = op_ctx_xm->entries[i]; + mlxsw_sp_router_xm_ml_entry_del(mlxsw_sp, fib_entry); + } + return err; +} + +static void +mlxsw_sp_router_xm_ml_entries_del(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_fib_entry_op_ctx_xm *op_ctx_xm) +{ + struct mlxsw_sp_router_xm_fib_entry *fib_entry; + int i; + + for (i = 0; i < op_ctx_xm->entries_count; i++) { + fib_entry = op_ctx_xm->entries[i]; + mlxsw_sp_router_xm_ml_entry_del(mlxsw_sp, fib_entry); + } +} + +static void +mlxsw_sp_router_xm_ml_entries_cache_flush(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_fib_entry_op_ctx_xm *op_ctx_xm) +{ + struct mlxsw_sp_router_xm_fib_entry *fib_entry; + int err; + int i; + + for (i = 0; i < op_ctx_xm->entries_count; i++) { + fib_entry = op_ctx_xm->entries[i]; + err = mlxsw_sp_router_xm_cache_flush_schedule(mlxsw_sp, + &fib_entry->flush_info); + if (err) + dev_err(mlxsw_sp->bus_info->dev, "Failed to flush XM cache\n"); + } +} + +static int mlxsw_sp_router_ll_xm_fib_entry_commit(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_fib_entry_op_ctx *op_ctx, + bool *postponed_for_bulk) +{ + struct mlxsw_sp_fib_entry_op_ctx_xm *op_ctx_xm = (void *) op_ctx->ll_priv; + struct mlxsw_sp_router_xm_fib_entry *fib_entry; + u8 num_rec; + int err; + int i; + + op_ctx_xm->trans_offset += op_ctx_xm->trans_item_len; + op_ctx_xm->entries_count++; + + /* Check if bulking is possible and there is still room for another + * FIB entry record. The size of 'trans_item_len' is either size of IPv4 + * command or size of IPv6 command. Not possible to mix those in a + * single XMDR write. + */ + if (op_ctx->bulk_ok && + op_ctx_xm->trans_offset + op_ctx_xm->trans_item_len <= MLXSW_REG_XMDR_TRANS_LEN) { + if (postponed_for_bulk) + *postponed_for_bulk = true; + return 0; + } + + if (op_ctx->event == FIB_EVENT_ENTRY_REPLACE) { + /* The L-table is updated inside. It has to be done before + * the prefix is inserted. + */ + err = mlxsw_sp_router_xm_ml_entries_add(mlxsw_sp, op_ctx_xm); + if (err) + goto out; + } + + err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(xmdr), op_ctx_xm->xmdr_pl); + if (err) + goto out; + num_rec = mlxsw_reg_xmdr_num_rec_get(op_ctx_xm->xmdr_pl); + if (num_rec > op_ctx_xm->entries_count) { + dev_err(mlxsw_sp->bus_info->dev, "Invalid XMDR number of records\n"); + err = -EIO; + goto out; + } + for (i = 0; i < num_rec; i++) { + if (!mlxsw_reg_xmdr_reply_vect_get(op_ctx_xm->xmdr_pl, i)) { + dev_err(mlxsw_sp->bus_info->dev, "Command send over XMDR failed\n"); + err = -EIO; + goto out; + } else { + fib_entry = op_ctx_xm->entries[i]; + fib_entry->committed = true; + } + } + + if (op_ctx->event == FIB_EVENT_ENTRY_DEL) + /* The L-table is updated inside. It has to be done after + * the prefix was removed. + */ + mlxsw_sp_router_xm_ml_entries_del(mlxsw_sp, op_ctx_xm); + + /* At the very end, do the XLT cache flushing to evict stale + * M and ML cache entries after prefixes were inserted/removed. 
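+ * The flush is scheduled as delayed work, giving flushes from + * subsequent commits a chance to be consolidated into fewer register + * writes.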
+ */ + mlxsw_sp_router_xm_ml_entries_cache_flush(mlxsw_sp, op_ctx_xm); + +out: + /* Next pack call is going to do reinitialization */ + op_ctx->initialized = false; + return err; +} + +static bool mlxsw_sp_router_ll_xm_fib_entry_is_committed(struct mlxsw_sp_fib_entry_priv *priv) +{ + struct mlxsw_sp_router_xm_fib_entry *fib_entry = (void *) priv->priv; + + return fib_entry->committed; +} + +const struct mlxsw_sp_router_ll_ops mlxsw_sp_router_ll_xm_ops = { + .init = mlxsw_sp_router_ll_xm_init, + .ralta_write = mlxsw_sp_router_ll_xm_ralta_write, + .ralst_write = mlxsw_sp_router_ll_xm_ralst_write, + .raltb_write = mlxsw_sp_router_ll_xm_raltb_write, + .fib_entry_op_ctx_size = sizeof(struct mlxsw_sp_fib_entry_op_ctx_xm), + .fib_entry_priv_size = sizeof(struct mlxsw_sp_router_xm_fib_entry), + .fib_entry_pack = mlxsw_sp_router_ll_xm_fib_entry_pack, + .fib_entry_act_remote_pack = mlxsw_sp_router_ll_xm_fib_entry_act_remote_pack, + .fib_entry_act_local_pack = mlxsw_sp_router_ll_xm_fib_entry_act_local_pack, + .fib_entry_act_ip2me_pack = mlxsw_sp_router_ll_xm_fib_entry_act_ip2me_pack, + .fib_entry_act_ip2me_tun_pack = mlxsw_sp_router_ll_xm_fib_entry_act_ip2me_tun_pack, + .fib_entry_commit = mlxsw_sp_router_ll_xm_fib_entry_commit, + .fib_entry_is_committed = mlxsw_sp_router_ll_xm_fib_entry_is_committed, +}; + +#define MLXSW_SP_ROUTER_XM_MINDEX_SIZE (64 * 1024) + +int mlxsw_sp_router_xm_init(struct mlxsw_sp *mlxsw_sp) +{ + struct mlxsw_sp_router_xm *router_xm; + char rxltm_pl[MLXSW_REG_RXLTM_LEN]; + char xltq_pl[MLXSW_REG_XLTQ_LEN]; + u32 mindex_size; + u16 device_id; + int err; + + if (!mlxsw_sp->bus_info->xm_exists) + return 0; + + router_xm = kzalloc(sizeof(*router_xm), GFP_KERNEL); + if (!router_xm) + return -ENOMEM; + + mlxsw_reg_xltq_pack(xltq_pl); + err = mlxsw_reg_query(mlxsw_sp->core, MLXSW_REG(xltq), xltq_pl); + if (err) + goto err_xltq_query; + mlxsw_reg_xltq_unpack(xltq_pl, &device_id, &router_xm->ipv4_supported, + &router_xm->ipv6_supported, &router_xm->entries_size, &mindex_size); + + if (device_id != MLXSW_REG_XLTQ_XM_DEVICE_ID_XLT) { + dev_err(mlxsw_sp->bus_info->dev, "Invalid XM device id\n"); + err = -EINVAL; + goto err_device_id_check; + } + + if (mindex_size != MLXSW_SP_ROUTER_XM_MINDEX_SIZE) { + dev_err(mlxsw_sp->bus_info->dev, "Unexpected M-index size\n"); + err = -EINVAL; + goto err_mindex_size_check; + } + + mlxsw_reg_rxltm_pack(rxltm_pl, mlxsw_sp_router_xm_m_val[MLXSW_SP_L3_PROTO_IPV4], + mlxsw_sp_router_xm_m_val[MLXSW_SP_L3_PROTO_IPV6]); + err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(rxltm), rxltm_pl); + if (err) + goto err_rxltm_write; + + err = rhashtable_init(&router_xm->ltable_ht, &mlxsw_sp_router_xm_ltable_ht_params); + if (err) + goto err_ltable_ht_init; + + err = rhashtable_init(&router_xm->flush_ht, &mlxsw_sp_router_xm_flush_ht_params); + if (err) + goto err_flush_ht_init; + + mlxsw_sp->router->xm = router_xm; + return 0; + +err_flush_ht_init: + rhashtable_destroy(&router_xm->ltable_ht); +err_ltable_ht_init: +err_rxltm_write: +err_mindex_size_check: +err_device_id_check: +err_xltq_query: + kfree(router_xm); + return err; +} + +void mlxsw_sp_router_xm_fini(struct mlxsw_sp *mlxsw_sp) +{ + struct mlxsw_sp_router_xm *router_xm = mlxsw_sp->router->xm; + + if (!mlxsw_sp->bus_info->xm_exists) + return; + + rhashtable_destroy(&router_xm->flush_ht); + rhashtable_destroy(&router_xm->ltable_ht); + kfree(router_xm); +} + +bool mlxsw_sp_router_xm_ipv4_is_supported(const struct mlxsw_sp *mlxsw_sp) +{ + struct mlxsw_sp_router_xm *router_xm = mlxsw_sp->router->xm; + + return 
router_xm && router_xm->ipv4_supported; +} diff --git a/drivers/net/mhi_net.c b/drivers/net/mhi_net.c index d3f92781314e..fa41d8c42f05 100644 --- a/drivers/net/mhi_net.c +++ b/drivers/net/mhi_net.c @@ -173,6 +173,7 @@ static void mhi_net_ul_callback(struct mhi_device *mhi_dev, { struct mhi_net_dev *mhi_netdev = dev_get_drvdata(&mhi_dev->dev); struct net_device *ndev = mhi_netdev->ndev; + struct mhi_device *mdev = mhi_netdev->mdev; struct sk_buff *skb = mhi_res->buf_addr; /* Hardware has consumed the buffer, so free the skb (which is not @@ -196,7 +197,7 @@ static void mhi_net_ul_callback(struct mhi_device *mhi_dev, } u64_stats_update_end(&mhi_netdev->stats.tx_syncp); - if (netif_queue_stopped(ndev)) + if (netif_queue_stopped(ndev) && !mhi_queue_is_full(mdev, DMA_TO_DEVICE)) netif_wake_queue(ndev); } diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 73c346e503d7..19499b7bb82c 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -416,12 +416,11 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi) unsigned int napi_id; struct socket *sock; struct sock *sk; - int err; if (!net_busy_loop_on()) return; - sock = sock_from_file(epi->ffd.file, &err); + sock = sock_from_file(epi->ffd.file); if (!sock) return; diff --git a/fs/io_uring.c b/fs/io_uring.c index a2a7c65a77aa..a97488c7b2cf 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4356,9 +4356,9 @@ static int io_sendmsg(struct io_kiocb *req, bool force_nonblock, unsigned flags; int ret; - sock = sock_from_file(req->file, &ret); + sock = sock_from_file(req->file); if (unlikely(!sock)) - return ret; + return -ENOTSOCK; if (req->async_data) { kmsg = req->async_data; @@ -4405,9 +4405,9 @@ static int io_send(struct io_kiocb *req, bool force_nonblock, unsigned flags; int ret; - sock = sock_from_file(req->file, &ret); + sock = sock_from_file(req->file); if (unlikely(!sock)) - return ret; + return -ENOTSOCK; ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter); if (unlikely(ret)) @@ -4585,9 +4585,9 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock, unsigned flags; int ret, cflags = 0; - sock = sock_from_file(req->file, &ret); + sock = sock_from_file(req->file); if (unlikely(!sock)) - return ret; + return -ENOTSOCK; if (req->async_data) { kmsg = req->async_data; @@ -4648,9 +4648,9 @@ static int io_recv(struct io_kiocb *req, bool force_nonblock, unsigned flags; int ret, cflags = 0; - sock = sock_from_file(req->file, &ret); + sock = sock_from_file(req->file); if (unlikely(!sock)) - return ret; + return -ENOTSOCK; if (req->flags & REQ_F_BUFFER_SELECT) { kbuf = io_recv_buffer_select(req, !force_nonblock); diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d05e75ed8c1b..07cb5d15e743 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1859,6 +1859,7 @@ extern const struct bpf_func_proto bpf_snprintf_btf_proto; extern const struct bpf_func_proto bpf_per_cpu_ptr_proto; extern const struct bpf_func_proto bpf_this_cpu_ptr_proto; extern const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto; +extern const struct bpf_func_proto bpf_sock_from_file_proto; const struct bpf_func_proto *bpf_tracing_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/include/linux/net.h b/include/linux/net.h index 0dcd51feef02..9e2324efc26a 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -240,7 +240,7 @@ int sock_sendmsg(struct socket *sock, struct msghdr *msg); int sock_recvmsg(struct socket *sock, struct msghdr *msg, int flags); struct file *sock_alloc_file(struct socket 
*sock, int flags, const char *dname); struct socket *sockfd_lookup(int fd, int *err); -struct socket *sock_from_file(struct file *file, int *err); +struct socket *sock_from_file(struct file *file); #define sockfd_put(sock) fput(sock->file) int net_ratelimit(void); diff --git a/include/net/inet_ecn.h b/include/net/inet_ecn.h index 563457fec557..ba77f47ef61e 100644 --- a/include/net/inet_ecn.h +++ b/include/net/inet_ecn.h @@ -8,6 +8,7 @@ #include <net/inet_sock.h> #include <net/dsfield.h> +#include <net/checksum.h> enum { INET_ECN_NOT_ECT = 0, @@ -75,8 +76,8 @@ static inline void INET_ECN_dontxmit(struct sock *sk) static inline int IP_ECN_set_ce(struct iphdr *iph) { - u32 check = (__force u32)iph->check; u32 ecn = (iph->tos + 1) & INET_ECN_MASK; + __be16 check_add; /* * After the last operation we have (in binary): @@ -93,23 +94,20 @@ static inline int IP_ECN_set_ce(struct iphdr *iph) * INET_ECN_ECT_1 => check += htons(0xFFFD) * INET_ECN_ECT_0 => check += htons(0xFFFE) */ - check += (__force u16)htons(0xFFFB) + (__force u16)htons(ecn); + check_add = (__force __be16)((__force u16)htons(0xFFFB) + + (__force u16)htons(ecn)); - iph->check = (__force __sum16)(check + (check>=0xFFFF)); + iph->check = csum16_add(iph->check, check_add); iph->tos |= INET_ECN_CE; return 1; } static inline int IP_ECN_set_ect1(struct iphdr *iph) { - u32 check = (__force u32)iph->check; - if ((iph->tos & INET_ECN_MASK) != INET_ECN_ECT_0) return 0; - check += (__force u16)htons(0x1); - - iph->check = (__force __sum16)(check + (check>=0xFFFF)); + iph->check = csum16_add(iph->check, htons(0x1)); iph->tos ^= INET_ECN_MASK; return 1; } diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 22bc07f4b043..dc20a47e3828 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -151,9 +151,6 @@ struct net { #endif struct sock *nfnl; struct sock *nfnl_stash; -#if IS_ENABLED(CONFIG_NETFILTER_NETLINK_ACCT) - struct list_head nfnl_acct_list; -#endif #if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) struct list_head nfct_timeout_list; #endif diff --git a/include/net/netfilter/ipv4/nf_reject.h b/include/net/netfilter/ipv4/nf_reject.h index 0d8ff84a2588..c653fcb88354 100644 --- a/include/net/netfilter/ipv4/nf_reject.h +++ b/include/net/netfilter/ipv4/nf_reject.h @@ -8,8 +8,8 @@ #include <net/netfilter/nf_reject.h> void nf_send_unreach(struct sk_buff *skb_in, int code, int hook); -void nf_send_reset(struct net *net, struct sk_buff *oldskb, int hook); - +void nf_send_reset(struct net *net, struct sock *, struct sk_buff *oldskb, + int hook); const struct tcphdr *nf_reject_ip_tcphdr_get(struct sk_buff *oldskb, struct tcphdr *_oth, int hook); struct iphdr *nf_reject_iphdr_put(struct sk_buff *nskb, diff --git a/include/net/netfilter/ipv6/nf_reject.h b/include/net/netfilter/ipv6/nf_reject.h index edcf6d1cd316..d729344ba644 100644 --- a/include/net/netfilter/ipv6/nf_reject.h +++ b/include/net/netfilter/ipv6/nf_reject.h @@ -7,9 +7,8 @@ void nf_send_unreach6(struct net *net, struct sk_buff *skb_in, unsigned char code, unsigned int hooknum); - -void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook); - +void nf_send_reset6(struct net *net, struct sock *sk, struct sk_buff *oldskb, + int hook); const struct tcphdr *nf_reject_ip6_tcphdr_get(struct sk_buff *oldskb, struct tcphdr *otcph, unsigned int *otcplen, int hook); diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h index 9be7320b994f..96f9cf81f46b 100644 --- 
a/include/net/netfilter/nf_conntrack_l4proto.h +++ b/include/net/netfilter/nf_conntrack_l4proto.h @@ -32,7 +32,7 @@ struct nf_conntrack_l4proto { /* convert protoinfo to nfnetink attributes */ int (*to_nlattr)(struct sk_buff *skb, struct nlattr *nla, - struct nf_conn *ct); + struct nf_conn *ct, bool destroy); /* convert nfnetlink attributes to protoinfo */ int (*from_nlattr)(struct nlattr *tb[], struct nf_conn *ct); diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index c1c0a4ff92ae..f4af8362d234 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -305,8 +305,33 @@ struct nft_set_estimate { enum nft_set_class space; }; +#define NFT_EXPR_MAXATTR 16 +#define NFT_EXPR_SIZE(size) (sizeof(struct nft_expr) + \ + ALIGN(size, __alignof__(struct nft_expr))) + +/** + * struct nft_expr - nf_tables expression + * + * @ops: expression ops + * @data: expression private data + */ +struct nft_expr { + const struct nft_expr_ops *ops; + unsigned char data[] + __attribute__((aligned(__alignof__(u64)))); +}; + +static inline void *nft_expr_priv(const struct nft_expr *expr) +{ + return (void *)expr->data; +} + +int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src); +void nft_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr); +int nft_expr_dump(struct sk_buff *skb, unsigned int attr, + const struct nft_expr *expr); + struct nft_set_ext; -struct nft_expr; /** * struct nft_set_ops - nf_tables set operations @@ -396,6 +421,22 @@ struct nft_set_type { }; #define to_set_type(o) container_of(o, struct nft_set_type, ops) +struct nft_set_elem_expr { + u8 size; + unsigned char data[] + __attribute__((aligned(__alignof__(struct nft_expr)))); +}; + +#define nft_setelem_expr_at(__elem_expr, __offset) \ + ((struct nft_expr *)&__elem_expr->data[__offset]) + +#define nft_setelem_expr_foreach(__expr, __elem_expr, __size) \ + for (__expr = nft_setelem_expr_at(__elem_expr, 0), __size = 0; \ + __size < (__elem_expr)->size; \ + __size += (__expr)->ops->size, __expr = ((void *)(__expr)) + (__expr)->ops->size) + +#define NFT_SET_EXPR_MAX 2 + /** * struct nft_set - nf_tables set instance * @@ -448,13 +489,14 @@ struct nft_set { u16 policy; u16 udlen; unsigned char *udata; - struct nft_expr *expr; /* runtime data below here */ const struct nft_set_ops *ops ____cacheline_aligned; u16 flags:14, genmask:2; u8 klen; u8 dlen; + u8 num_exprs; + struct nft_expr *exprs[NFT_SET_EXPR_MAX]; unsigned char data[] __attribute__((aligned(__alignof__(u64)))); }; @@ -519,7 +561,7 @@ void nf_tables_destroy_set(const struct nft_ctx *ctx, struct nft_set *set); * @NFT_SET_EXT_TIMEOUT: element timeout * @NFT_SET_EXT_EXPIRATION: element expiration time * @NFT_SET_EXT_USERDATA: user data associated with the element - * @NFT_SET_EXT_EXPR: expression assiociated with the element + * @NFT_SET_EXT_EXPRESSIONS: expressions associated with the element * @NFT_SET_EXT_OBJREF: stateful object reference associated with element * @NFT_SET_EXT_NUM: number of extension types */ @@ -531,7 +573,7 @@ enum nft_set_extensions { NFT_SET_EXT_TIMEOUT, NFT_SET_EXT_EXPIRATION, NFT_SET_EXT_USERDATA, - NFT_SET_EXT_EXPR, + NFT_SET_EXT_EXPRESSIONS, NFT_SET_EXT_OBJREF, NFT_SET_EXT_NUM }; @@ -649,9 +691,9 @@ static inline struct nft_userdata *nft_set_ext_userdata(const struct nft_set_ext return nft_set_ext(ext, NFT_SET_EXT_USERDATA); } -static inline struct nft_expr *nft_set_ext_expr(const struct nft_set_ext *ext) +static inline struct nft_set_elem_expr *nft_set_ext_expr(const struct
nft_set_ext *ext) { - return nft_set_ext(ext, NFT_SET_EXT_EXPR); + return nft_set_ext(ext, NFT_SET_EXT_EXPRESSIONS); } static inline bool nft_set_elem_expired(const struct nft_set_ext *ext) @@ -794,7 +836,6 @@ struct nft_offload_ctx; * @validate: validate expression, called during loop detection * @data: extra data to attach to this expression operation */ -struct nft_expr; struct nft_expr_ops { void (*eval)(const struct nft_expr *expr, struct nft_regs *regs, @@ -830,32 +871,6 @@ struct nft_expr_ops { void *data; }; -#define NFT_EXPR_MAXATTR 16 -#define NFT_EXPR_SIZE(size) (sizeof(struct nft_expr) + \ - ALIGN(size, __alignof__(struct nft_expr))) - -/** - * struct nft_expr - nf_tables expression - * - * @ops: expression ops - * @data: expression private data - */ -struct nft_expr { - const struct nft_expr_ops *ops; - unsigned char data[] - __attribute__((aligned(__alignof__(u64)))); -}; - -static inline void *nft_expr_priv(const struct nft_expr *expr) -{ - return (void *)expr->data; -} - -int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src); -void nft_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr); -int nft_expr_dump(struct sk_buff *skb, unsigned int attr, - const struct nft_expr *expr); - /** * struct nft_rule - nf_tables rule * @@ -908,11 +923,17 @@ static inline void nft_set_elem_update_expr(const struct nft_set_ext *ext, struct nft_regs *regs, const struct nft_pktinfo *pkt) { + struct nft_set_elem_expr *elem_expr; struct nft_expr *expr; - - if (__nft_set_ext_exists(ext, NFT_SET_EXT_EXPR)) { - expr = nft_set_ext_expr(ext); - expr->ops->eval(expr, regs, pkt); + u32 size; + + if (__nft_set_ext_exists(ext, NFT_SET_EXT_EXPRESSIONS)) { + elem_expr = nft_set_ext_expr(ext); + nft_setelem_expr_foreach(expr, elem_expr, size) { + expr->ops->eval(expr, regs, pkt); + if (regs->verdict.code == NFT_BREAK) + return; + } } } diff --git a/include/net/tcp.h b/include/net/tcp.h index a62fb7f8a1e3..78d13c88720f 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -611,7 +611,7 @@ void tcp_skb_collapse_tstamp(struct sk_buff *skb, /* tcp_input.c */ void tcp_rearm_rto(struct sock *sk); void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req); -void tcp_reset(struct sock *sk); +void tcp_reset(struct sock *sk, struct sk_buff *skb); void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb); void tcp_fin(struct sock *sk); @@ -1326,7 +1326,9 @@ static inline unsigned long tcp_probe0_base(const struct sock *sk) static inline unsigned long tcp_probe0_when(const struct sock *sk, unsigned long max_when) { - u64 when = (u64)tcp_probe0_base(sk) << inet_csk(sk)->icsk_backoff; + u8 backoff = min_t(u8, ilog2(TCP_RTO_MAX / TCP_RTO_MIN) + 1, + inet_csk(sk)->icsk_backoff); + u64 when = (u64)tcp_probe0_base(sk) << backoff; return (unsigned long)min_t(u64, when, max_when); } diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index cd24e8a59529..76a97176ab81 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -145,17 +145,17 @@ DEFINE_EVENT(xdp_redirect_template, xdp_redirect_err, TP_ARGS(dev, xdp, tgt, err, map, index) ); -#define _trace_xdp_redirect(dev, xdp, to) \ - trace_xdp_redirect(dev, xdp, NULL, 0, NULL, to); +#define _trace_xdp_redirect(dev, xdp, to) \ + trace_xdp_redirect(dev, xdp, NULL, 0, NULL, to) -#define _trace_xdp_redirect_err(dev, xdp, to, err) \ - trace_xdp_redirect_err(dev, xdp, NULL, err, NULL, to); +#define _trace_xdp_redirect_err(dev, xdp, to, err) \ + trace_xdp_redirect_err(dev, xdp, NULL, err, NULL, to) 
#define _trace_xdp_redirect_map(dev, xdp, to, map, index) \ - trace_xdp_redirect(dev, xdp, to, 0, map, index); + trace_xdp_redirect(dev, xdp, to, 0, map, index) #define _trace_xdp_redirect_map_err(dev, xdp, to, map, index, err) \ - trace_xdp_redirect_err(dev, xdp, to, err, map, index); + trace_xdp_redirect_err(dev, xdp, to, err, map, index) /* not used anymore, but kept around so as not to break old programs */ DEFINE_EVENT(xdp_redirect_template, xdp_redirect_map, diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7e9931b6c735..77d7c1bb2923 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3822,6 +3822,14 @@ union bpf_attr { * The **hash_algo** is returned on success, * **-EOPNOTSUP** if IMA is disabled or **-EINVAL** if * invalid arguments are passed. + * + * struct socket *bpf_sock_from_file(struct file *file) + * Description + * If the given file represents a socket, returns the associated + * socket. + * Return + * A pointer to a struct socket on success or NULL if the file is + * not a socket. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3986,6 +3994,7 @@ union bpf_attr { FN(bprm_opts_set), \ FN(ktime_get_coarse_ns), \ FN(ima_inode_hash), \ + FN(sock_from_file), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 98272cb5f617..28b6ee53305f 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -361,6 +361,7 @@ enum nft_set_field_attributes { * @NFTA_SET_OBJ_TYPE: stateful object type (NLA_U32: NFT_OBJECT_*) * @NFTA_SET_HANDLE: set handle (NLA_U64) * @NFTA_SET_EXPR: set expression (NLA_NESTED: nft_expr_attributes) + * @NFTA_SET_EXPRESSIONS: list of expressions (NLA_NESTED: nft_list_attributes) */ enum nft_set_attributes { NFTA_SET_UNSPEC, @@ -381,6 +382,7 @@ enum nft_set_attributes { NFTA_SET_OBJ_TYPE, NFTA_SET_HANDLE, NFTA_SET_EXPR, + NFTA_SET_EXPRESSIONS, __NFTA_SET_MAX }; #define NFTA_SET_MAX (__NFTA_SET_MAX - 1) @@ -406,6 +408,7 @@ enum nft_set_elem_flags { * @NFTA_SET_ELEM_EXPR: expression (NLA_NESTED: nft_expr_attributes) * @NFTA_SET_ELEM_OBJREF: stateful object reference (NLA_STRING) * @NFTA_SET_ELEM_KEY_END: closing key value (NLA_NESTED: nft_data) + * @NFTA_SET_ELEM_EXPRESSIONS: list of expressions (NLA_NESTED: nft_list_attributes) */ enum nft_set_elem_attributes { NFTA_SET_ELEM_UNSPEC, @@ -419,6 +422,7 @@ enum nft_set_elem_attributes { NFTA_SET_ELEM_PAD, NFTA_SET_ELEM_OBJREF, NFTA_SET_ELEM_KEY_END, + NFTA_SET_ELEM_EXPRESSIONS, __NFTA_SET_ELEM_MAX }; #define NFTA_SET_ELEM_MAX (__NFTA_SET_ELEM_MAX - 1) @@ -715,6 +719,7 @@ enum nft_dynset_flags { * @NFTA_DYNSET_TIMEOUT: timeout value for the new element (NLA_U64) * @NFTA_DYNSET_EXPR: expression (NLA_NESTED: nft_expr_attributes) * @NFTA_DYNSET_FLAGS: flags (NLA_U32) + * @NFTA_DYNSET_EXPRESSIONS: list of expressions (NLA_NESTED: nft_list_attributes) */ enum nft_dynset_attributes { NFTA_DYNSET_UNSPEC, @@ -727,6 +732,7 @@ enum nft_dynset_attributes { NFTA_DYNSET_EXPR, NFTA_DYNSET_PAD, NFTA_DYNSET_FLAGS, + NFTA_DYNSET_EXPRESSIONS, __NFTA_DYNSET_MAX, }; #define NFTA_DYNSET_MAX (__NFTA_DYNSET_MAX - 1) diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index fe7a0733a63a..7e848200cd26 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -224,7 +224,7 @@ static void *fd_htab_map_get_ptr(const struct bpf_map *map, struct htab_elem *l) static struct htab_elem *get_htab_elem(struct bpf_htab 
*htab, int i) { - return (struct htab_elem *) (htab->elems + i * htab->elem_size); + return (struct htab_elem *) (htab->elems + i * (u64)htab->elem_size); } static void htab_free_elems(struct bpf_htab *htab) @@ -280,7 +280,7 @@ static int prealloc_init(struct bpf_htab *htab) if (!htab_is_percpu(htab) && !htab_is_lru(htab)) num_entries += num_possible_cpus(); - htab->elems = bpf_map_area_alloc(htab->elem_size * num_entries, + htab->elems = bpf_map_area_alloc((u64)htab->elem_size * num_entries, htab->map.numa_node); if (!htab->elems) return -ENOMEM; @@ -1412,7 +1412,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map, void *keys = NULL, *values = NULL, *value, *dst_key, *dst_val; void __user *uvalues = u64_to_user_ptr(attr->batch.values); void __user *ukeys = u64_to_user_ptr(attr->batch.keys); - void *ubatch = u64_to_user_ptr(attr->batch.in_batch); + void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch); u32 batch, max_count, size, bucket_size; struct htab_elem *node_to_free = NULL; u64 elem_map_flags, map_flags; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0cd3cc2af9c1..287be337d5f6 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2121,8 +2121,11 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) if (IS_ERR(attach_btf)) return -EINVAL; if (!btf_is_kernel(attach_btf)) { + /* attaching through specifying bpf_prog's BTF + * objects directly might be supported eventually + */ btf_put(attach_btf); - return -EINVAL; + return -ENOTSUPP; } } } else if (attr->attach_btf_id) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d43b30c92c7d..17270b8404f1 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3767,7 +3767,8 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, goto mark; if (state->stack[spi].slot_type[0] == STACK_SPILL && - state->stack[spi].spilled_ptr.type == SCALAR_VALUE) { + (state->stack[spi].spilled_ptr.type == SCALAR_VALUE || + env->allow_ptr_leaks)) { __mark_reg_unknown(env, &state->stack[spi].spilled_ptr); for (j = 0; j < BPF_REG_SIZE; j++) state->stack[spi].slot_type[j] = STACK_MISC; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index fb2fbcbf6de6..4be771df5549 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1758,6 +1758,8 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_storage_get_tracing_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_tracing_proto; + case BPF_FUNC_sock_from_file: + return &bpf_sock_from_file_proto; #endif case BPF_FUNC_seq_printf: return prog->expected_attach_type == BPF_TRACE_ITER ? 
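/* The (u64) casts added above guard a 32-bit multiply: elem_size and
 * the element index are both u32, so i * elem_size wraps before it is
 * used for addressing once the map is large enough. A standalone
 * demonstration with illustrative numbers:
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t elem_size = 64;
	uint32_t i = 0x08000000;	/* the 2^27-th element */

	uint32_t wrapped = i * elem_size;		/* 2^33 mod 2^32 == 0 */
	uint64_t correct = (uint64_t)elem_size * i;	/* 0x200000000 */

	printf("wrapped=%u correct=%llu\n",
	       wrapped, (unsigned long long)correct);
	return 0;
}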
diff --git a/net/bridge/br.c b/net/bridge/br.c index 401eeb9142eb..1b169f8e7491 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -43,7 +43,10 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v if (event == NETDEV_REGISTER) { /* register of bridge completed, add sysfs entries */ - br_sysfs_addbr(dev); + err = br_sysfs_addbr(dev); + if (err) + return notifier_from_errno(err); + return NOTIFY_DONE; } } diff --git a/net/bridge/netfilter/Kconfig b/net/bridge/netfilter/Kconfig index e4d287afc2c9..ac5372121e60 100644 --- a/net/bridge/netfilter/Kconfig +++ b/net/bridge/netfilter/Kconfig @@ -18,6 +18,8 @@ config NFT_BRIDGE_META config NFT_BRIDGE_REJECT tristate "Netfilter nf_tables bridge reject support" depends on NFT_REJECT + depends on NF_REJECT_IPV4 + depends on NF_REJECT_IPV6 help Add support to reject packets. diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index a32037daa933..4edd033e899c 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -394,6 +394,7 @@ static bool bpf_sk_storage_tracing_allowed(const struct bpf_prog *prog) * use the bpf_sk_storage_(get|delete) helper. */ switch (prog->expected_attach_type) { + case BPF_TRACE_ITER: case BPF_TRACE_RAW_TP: /* bpf_sk_storage has no trace point */ return true; diff --git a/net/core/filter.c b/net/core/filter.c index 77001a35768f..255aeee72402 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -10413,6 +10413,24 @@ const struct bpf_func_proto bpf_skc_to_udp6_sock_proto = { .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_UDP6], }; +BPF_CALL_1(bpf_sock_from_file, struct file *, file) +{ + return (unsigned long)sock_from_file(file); +} + +BTF_ID_LIST(bpf_sock_from_file_btf_ids) +BTF_ID(struct, socket) +BTF_ID(struct, file) + +const struct bpf_func_proto bpf_sock_from_file_proto = { + .func = bpf_sock_from_file, + .gpl_only = false, + .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, + .ret_btf_id = &bpf_sock_from_file_btf_ids[0], + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &bpf_sock_from_file_btf_ids[1], +}; + static const struct bpf_func_proto * bpf_sk_base_func_proto(enum bpf_func_id func_id) { diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c index 41b24cd31562..b49c57d35a88 100644 --- a/net/core/netclassid_cgroup.c +++ b/net/core/netclassid_cgroup.c @@ -68,9 +68,8 @@ struct update_classid_context { static int update_classid_sock(const void *v, struct file *file, unsigned n) { - int err; struct update_classid_context *ctx = (void *)v; - struct socket *sock = sock_from_file(file, &err); + struct socket *sock = sock_from_file(file); if (sock) { spin_lock(&cgroup_sk_update_lock); diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index 9bd4cab7d510..99a431c56f23 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -220,8 +220,7 @@ static ssize_t write_priomap(struct kernfs_open_file *of, static int update_netprio(const void *v, struct file *file, unsigned n) { - int err; - struct socket *sock = sock_from_file(file, &err); + struct socket *sock = sock_from_file(file); if (sock) { spin_lock(&cgroup_sk_update_lock); sock_cgroup_set_prioidx(&sock->sk->sk_cgrp_data, diff --git a/net/core/skbuff.c b/net/core/skbuff.c index bfa5c9969393..f62cae3f75d8 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2018,6 +2018,12 @@ int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len) skb->csum = csum_block_sub(skb->csum, skb_checksum(skb, len, delta, 0), len); + } else if (skb->ip_summed == CHECKSUM_PARTIAL) { + 
int hdlen = (len > skb_headlen(skb)) ? skb_headlen(skb) : len; + int offset = skb_checksum_start_offset(skb) + skb->csum_offset; + + if (offset + sizeof(__sum16) > hdlen) + return -EINVAL; } return __pskb_trim(skb, len); } diff --git a/net/core/sock.c b/net/core/sock.c index 4fd7e785f177..bbcd4b97eddd 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2827,14 +2827,8 @@ EXPORT_SYMBOL(sock_no_mmap); void __receive_sock(struct file *file) { struct socket *sock; - int error; - /* - * The resulting value of "error" is ignored here since we only - * need to take action when the file is a socket and testing - * "sock" for NULL is sufficient. - */ - sock = sock_from_file(file, &error); + sock = sock_from_file(file); if (sock) { sock_update_netprioidx(&sock->sk->sk_cgrp_data); sock_update_classid(&sock->sk->sk_cgrp_data); diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c index e16b98ee6266..4b8840734762 100644 --- a/net/ipv4/netfilter/ipt_REJECT.c +++ b/net/ipv4/netfilter/ipt_REJECT.c @@ -56,7 +56,8 @@ reject_tg(struct sk_buff *skb, const struct xt_action_param *par) nf_send_unreach(skb, ICMP_PKT_FILTERED, hook); break; case IPT_TCP_RESET: - nf_send_reset(xt_net(par), skb, hook); + nf_send_reset(xt_net(par), par->state->sk, skb, hook); + break; case IPT_ICMP_ECHOREPLY: /* Doesn't happen. */ break; diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c index 4109055cdea6..4eed5afca392 100644 --- a/net/ipv4/netfilter/nf_reject_ipv4.c +++ b/net/ipv4/netfilter/nf_reject_ipv4.c @@ -234,7 +234,8 @@ static int nf_reject_fill_skb_dst(struct sk_buff *skb_in) } /* Send RST reply */ -void nf_send_reset(struct net *net, struct sk_buff *oldskb, int hook) +void nf_send_reset(struct net *net, struct sock *sk, struct sk_buff *oldskb, + int hook) { struct net_device *br_indev __maybe_unused; struct sk_buff *nskb; @@ -267,8 +268,7 @@ void nf_send_reset(struct net *net, struct sk_buff *oldskb, int hook) niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_TCP, ip4_dst_hoplimit(skb_dst(nskb))); nf_reject_ip_tcphdr_put(nskb, oldskb, oth); - - if (ip_route_me_harder(net, nskb->sk, nskb, RTN_UNSPEC)) + if (ip_route_me_harder(net, sk, nskb, RTN_UNSPEC)) goto free_nskb; niph = ip_hdr(nskb); diff --git a/net/ipv4/netfilter/nft_reject_ipv4.c b/net/ipv4/netfilter/nft_reject_ipv4.c index e408f813f5d8..ff437e4ed6db 100644 --- a/net/ipv4/netfilter/nft_reject_ipv4.c +++ b/net/ipv4/netfilter/nft_reject_ipv4.c @@ -27,7 +27,8 @@ static void nft_reject_ipv4_eval(const struct nft_expr *expr, nf_send_unreach(pkt->skb, priv->icmp_code, nft_hook(pkt)); break; case NFT_REJECT_TCP_RST: - nf_send_reset(nft_net(pkt), pkt->skb, nft_hook(pkt)); + nf_send_reset(nft_net(pkt), pkt->xt.state->sk, pkt->skb, + nft_hook(pkt)); break; default: break; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d6ad3b5c38e7..48ee476aa031 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4218,10 +4218,13 @@ static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq) } /* When we get a reset we do this. */ -void tcp_reset(struct sock *sk) +void tcp_reset(struct sock *sk, struct sk_buff *skb) { trace_tcp_receive_reset(sk); + if (sk_is_mptcp(sk)) + mptcp_incoming_options(sk, skb); + /* We want the right error as BSD sees it (and indeed as we do). 
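/* What the new CHECKSUM_PARTIAL branch in pskb_trim_rcsum_slow() above
 * checks, reduced to a standalone sketch: for CHECKSUM_PARTIAL skbs the
 * device writes the 16-bit checksum at csum_start + csum_offset, so
 * that slot must still fit inside the linear bytes left after the trim.
 * Field names mirror the skb, but this is a simplified model, not the
 * kernel's geometry helpers:
 */
#include <stdint.h>

struct partial_csum_skb {
	uint32_t headlen;	/* skb_headlen(): linear bytes */
	uint32_t csum_start;	/* offset checksumming starts at */
	uint32_t csum_offset;	/* where the result is stored */
};

/* 0 if trimming to @len keeps the checksum slot valid, -1 otherwise
 * (the -EINVAL case in the patch). */
static int trim_ok(const struct partial_csum_skb *skb, uint32_t len)
{
	uint32_t hdlen = len > skb->headlen ? skb->headlen : len;
	uint32_t offset = skb->csum_start + skb->csum_offset;

	return offset + sizeof(uint16_t) > hdlen ? -1 : 0;
}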
*/ switch (sk->sk_state) { case TCP_SYN_SENT: @@ -5604,7 +5607,7 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, &tp->last_oow_ack_time)) tcp_send_dupack(sk, skb); } else if (tcp_reset_check(sk, skb)) { - tcp_reset(sk); + tcp_reset(sk, skb); } goto discard; } @@ -5640,7 +5643,7 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, } if (rst_seq_match) - tcp_reset(sk); + tcp_reset(sk, skb); else { /* Disable TFO if RST is out-of-order * and no data has been received @@ -6077,7 +6080,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, */ if (th->rst) { - tcp_reset(sk); + tcp_reset(sk, skb); goto discard; } @@ -6519,7 +6522,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA); - tcp_reset(sk); + tcp_reset(sk, skb); return 1; } } diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 495dda2449fe..0055ae0a3bf8 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -801,7 +801,7 @@ embryonic_reset: req->rsk_ops->send_reset(sk, skb); } else if (fastopen) { /* received a valid RST pkt */ reqsk_fastopen_remove(sk, req, true); - tcp_reset(sk); + tcp_reset(sk, skb); } if (!fastopen) { inet_csk_reqsk_queue_drop(sk, req); diff --git a/net/ipv6/netfilter/ip6t_REJECT.c b/net/ipv6/netfilter/ip6t_REJECT.c index 3ac5485049f0..a35019d2e480 100644 --- a/net/ipv6/netfilter/ip6t_REJECT.c +++ b/net/ipv6/netfilter/ip6t_REJECT.c @@ -61,7 +61,7 @@ reject_tg6(struct sk_buff *skb, const struct xt_action_param *par) /* Do nothing */ break; case IP6T_TCP_RESET: - nf_send_reset6(net, skb, xt_hooknum(par)); + nf_send_reset6(net, par->state->sk, skb, xt_hooknum(par)); break; case IP6T_ICMP6_POLICY_FAIL: nf_send_unreach6(net, skb, ICMPV6_POLICY_FAIL, xt_hooknum(par)); diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c index aa35e6e37c1f..570d1d76c44d 100644 --- a/net/ipv6/netfilter/nf_reject_ipv6.c +++ b/net/ipv6/netfilter/nf_reject_ipv6.c @@ -275,7 +275,8 @@ static int nf_reject6_fill_skb_dst(struct sk_buff *skb_in) return 0; } -void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook) +void nf_send_reset6(struct net *net, struct sock *sk, struct sk_buff *oldskb, + int hook) { struct net_device *br_indev __maybe_unused; struct sk_buff *nskb; @@ -367,7 +368,7 @@ void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook) dev_queue_xmit(nskb); } else #endif - ip6_local_out(net, nskb->sk, nskb); + ip6_local_out(net, sk, nskb); } EXPORT_SYMBOL_GPL(nf_send_reset6); diff --git a/net/ipv6/netfilter/nft_reject_ipv6.c b/net/ipv6/netfilter/nft_reject_ipv6.c index c1098a1968e1..7969d1f3018d 100644 --- a/net/ipv6/netfilter/nft_reject_ipv6.c +++ b/net/ipv6/netfilter/nft_reject_ipv6.c @@ -28,7 +28,8 @@ static void nft_reject_ipv6_eval(const struct nft_expr *expr, nft_hook(pkt)); break; case NFT_REJECT_TCP_RST: - nf_send_reset6(nft_net(pkt), pkt->skb, nft_hook(pkt)); + nf_send_reset6(nft_net(pkt), pkt->xt.state->sk, pkt->skb, + nft_hook(pkt)); break; default: break; diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 1ca60d9da3ef..5e7d7755d1a6 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -282,6 +282,16 @@ static void mptcp_parse_option(const struct sk_buff *skb, pr_debug("RM_ADDR: id=%d", mp_opt->rm_id); break; + case MPTCPOPT_MP_FASTCLOSE: + if (opsize != 
TCPOLEN_MPTCP_FASTCLOSE) + break; + + ptr += 2; + mp_opt->rcvr_key = get_unaligned_be64(ptr); + ptr += 8; + mp_opt->fastclose = 1; + break; + default: break; } @@ -299,6 +309,7 @@ void mptcp_get_options(const struct sk_buff *skb, mp_opt->mp_join = 0; mp_opt->add_addr = 0; mp_opt->ahmac = 0; + mp_opt->fastclose = 0; mp_opt->port = 0; mp_opt->rm_addr = 0; mp_opt->dss = 0; @@ -942,6 +953,12 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) if (!check_fully_established(msk, sk, subflow, skb, &mp_opt)) return; + if (mp_opt.fastclose && + msk->local_key == mp_opt.rcvr_key) { + WRITE_ONCE(msk->rcv_fastclose, true); + mptcp_schedule_work((struct sock *)msk); + } + if (mp_opt.add_addr && add_addr_hmac_valid(msk, &mp_opt)) { struct mptcp_addr_info addr; diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 5151cfcd6962..a6d983d80576 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -135,7 +135,7 @@ select_local_address(const struct pm_nl_pernet *pernet, struct mptcp_pm_addr_entry *entry, *ret = NULL; rcu_read_lock(); - spin_lock_bh(&msk->join_list_lock); + __mptcp_flush_join_list(msk); list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { if (!(entry->addr.flags & MPTCP_PM_ADDR_FLAG_SUBFLOW)) continue; @@ -144,13 +144,11 @@ select_local_address(const struct pm_nl_pernet *pernet, * pending join */ if (entry->addr.family == ((struct sock *)msk)->sk_family && - !lookup_subflow_by_saddr(&msk->conn_list, &entry->addr) && - !lookup_subflow_by_saddr(&msk->join_list, &entry->addr)) { + !lookup_subflow_by_saddr(&msk->conn_list, &entry->addr)) { ret = entry; break; } } - spin_unlock_bh(&msk->join_list_lock); rcu_read_unlock(); return ret; } @@ -867,13 +865,14 @@ static int mptcp_nl_cmd_del_addr(struct sk_buff *skb, struct genl_info *info) return ret; } -static void __flush_addrs(struct pm_nl_pernet *pernet) +static void __flush_addrs(struct net *net, struct list_head *list) { - while (!list_empty(&pernet->local_addr_list)) { + while (!list_empty(list)) { struct mptcp_pm_addr_entry *cur; - cur = list_entry(pernet->local_addr_list.next, + cur = list_entry(list->next, struct mptcp_pm_addr_entry, list); + mptcp_nl_remove_subflow_and_signal_addr(net, &cur->addr); list_del_rcu(&cur->list); kfree_rcu(cur, rcu); } @@ -890,11 +889,13 @@ static void __reset_counters(struct pm_nl_pernet *pernet) static int mptcp_nl_cmd_flush_addrs(struct sk_buff *skb, struct genl_info *info) { struct pm_nl_pernet *pernet = genl_info_pm_nl(info); + LIST_HEAD(free_list); spin_lock_bh(&pernet->lock); - __flush_addrs(pernet); + list_splice_init(&pernet->local_addr_list, &free_list); __reset_counters(pernet); spin_unlock_bh(&pernet->lock); + __flush_addrs(sock_net(skb->sk), &free_list); return 0; } @@ -1156,10 +1157,12 @@ static void __net_exit pm_nl_exit_net(struct list_head *net_list) struct net *net; list_for_each_entry(net, net_list, exit_list) { + struct pm_nl_pernet *pernet = net_generic(net, pm_nl_pernet_id); + /* net is removed from namespace list, can't race with * other modifiers */ - __flush_addrs(net_generic(net, pm_nl_pernet_id)); + __flush_addrs(net, &pernet->local_addr_list); } } diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 2540d82742ac..b812aaae8044 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1256,6 +1256,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, struct mptcp_ext *mpext = NULL; struct sk_buff *skb, *tail; bool can_collapse = false; + int size_bias = 0; int avail_size; size_t ret = 0; @@ -1277,10 
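/* Wire format of the MP_FASTCLOSE option parsed above (RFC 8684): the
 * twelve bytes behind TCPOLEN_MPTCP_FASTCLOSE split into kind, length,
 * a subtype/reserved pair, and the peer's 64-bit receiver key. The
 * parser's "ptr += 2" steps over the subtype/reserved bytes before
 * get_unaligned_be64() pulls the key. An illustrative layout, not the
 * kernel's own struct:
 */
#include <stdint.h>

struct mp_fastclose_opt {
	uint8_t kind;		/* TCPOPT_MPTCP (30) */
	uint8_t len;		/* TCPOLEN_MPTCP_FASTCLOSE == 12 */
	uint8_t subtype_rsvd[2];/* MPTCPOPT_MP_FASTCLOSE + reserved */
	uint8_t rcvr_key[8];	/* receiver's key, network byte order */
} __attribute__((packed));

static uint64_t fastclose_key(const struct mp_fastclose_opt *opt)
{
	uint64_t key = 0;
	int i;

	for (i = 0; i < 8; i++)	/* big-endian read, as the parser does */
		key = (key << 8) | opt->rcvr_key[i];
	return key;
}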
+1278,12 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, mpext = skb_ext_find(skb, SKB_EXT_MPTCP); can_collapse = (info->size_goal - skb->len > 0) && mptcp_skb_can_collapse_to(data_seq, skb, mpext); - if (!can_collapse) + if (!can_collapse) { TCP_SKB_CB(skb)->eor = 1; - else + } else { + size_bias = skb->len; avail_size = info->size_goal - skb->len; + } } /* Zero window and all data acked? Probe. */ @@ -1300,8 +1303,8 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, return 0; ret = info->limit - info->sent; - tail = tcp_build_frag(ssk, avail_size, info->flags, dfrag->page, - dfrag->offset + info->sent, &ret); + tail = tcp_build_frag(ssk, avail_size + size_bias, info->flags, + dfrag->page, dfrag->offset + info->sent, &ret); if (!tail) { tcp_remove_empty_skb(sk, tcp_write_queue_tail(ssk)); return -ENOMEM; @@ -1310,8 +1313,9 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, /* if the tail skb is still the cached one, collapsing really happened. */ if (skb == tail) { - WARN_ON_ONCE(!can_collapse); + TCP_SKB_CB(tail)->tcp_flags &= ~TCPHDR_PSH; mpext->data_len += ret; + WARN_ON_ONCE(!can_collapse); WARN_ON_ONCE(zero_window_probe); goto out; } @@ -2217,6 +2221,36 @@ static bool mptcp_check_close_timeout(const struct sock *sk) return true; } +static void mptcp_check_fastclose(struct mptcp_sock *msk) +{ + struct mptcp_subflow_context *subflow, *tmp; + struct sock *sk = &msk->sk.icsk_inet.sk; + + if (likely(!READ_ONCE(msk->rcv_fastclose))) + return; + + mptcp_token_destroy(msk); + + list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) { + struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); + + lock_sock(tcp_sk); + if (tcp_sk->sk_state != TCP_CLOSE) { + tcp_send_active_reset(tcp_sk, GFP_ATOMIC); + tcp_set_state(tcp_sk, TCP_CLOSE); + } + release_sock(tcp_sk); + } + + inet_sk_state_store(sk, TCP_CLOSE); + sk->sk_shutdown = SHUTDOWN_MASK; + smp_mb__before_atomic(); /* SHUTDOWN must be visible first */ + set_bit(MPTCP_DATA_READY, &msk->flags); + set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags); + + mptcp_close_wake_up(sk); +} + static void mptcp_worker(struct work_struct *work) { struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work); @@ -2233,6 +2267,9 @@ static void mptcp_worker(struct work_struct *work) mptcp_check_data_fin_ack(sk); __mptcp_flush_join_list(msk); + + mptcp_check_fastclose(msk); + if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) __mptcp_close_subflow(msk); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index f6c3c686a34a..7cf9d110b85f 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -23,6 +23,7 @@ #define OPTION_MPTCP_ADD_ADDR BIT(6) #define OPTION_MPTCP_ADD_ADDR6 BIT(7) #define OPTION_MPTCP_RM_ADDR BIT(8) +#define OPTION_MPTCP_FASTCLOSE BIT(9) /* MPTCP option subtypes */ #define MPTCPOPT_MP_CAPABLE 0 @@ -58,6 +59,7 @@ #define TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT 24 #define TCPOLEN_MPTCP_PORT_LEN 4 #define TCPOLEN_MPTCP_RM_ADDR_BASE 4 +#define TCPOLEN_MPTCP_FASTCLOSE 12 /* MPTCP MP_JOIN flags */ #define MPTCPOPT_BACKUP BIT(0) @@ -110,6 +112,7 @@ struct mptcp_options_received { u16 data_len; u16 mp_capable : 1, mp_join : 1, + fastclose : 1, dss : 1, add_addr : 1, rm_addr : 1, @@ -119,7 +122,7 @@ struct mptcp_options_received { u32 token; u32 nonce; u64 thmac; - u8 hmac[20]; + u8 hmac[MPTCPOPT_HMAC_LEN]; u8 join_id; u8 use_map:1, dsn64:1, @@ -237,6 +240,7 @@ struct mptcp_sock { bool fully_established; bool rcv_data_fin; bool snd_data_fin_enable; + bool rcv_fastclose; bool 
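/* The size_bias accounting in mptcp_sendmsg_frag() above, with
 * illustrative numbers: say info->size_goal is 1460 and the tail skb
 * already carries 1000 collapsible bytes. Then avail_size = 460 and
 * size_bias = 1000. tcp_build_frag() measures its room against the
 * tail's current skb->len, so it must be handed avail_size + size_bias
 * (the full size goal) to end up appending exactly 460 fresh bytes:
 */
#include <assert.h>

int main(void)
{
	int size_goal = 1460, tail_len = 1000;
	int avail_size = size_goal - tail_len;	/* new payload allowed */
	int size_bias = tail_len;		/* already in the tail */

	/* tcp_build_frag()'s effective copy budget: limit - skb->len */
	int copy = (avail_size + size_bias) - tail_len;

	assert(copy == avail_size);
	return 0;
}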
use_64bit_ack; /* Set when we received a 64-bit DSN */ spinlock_t join_list_lock; struct sock *ack_hint; diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index fefcaf497938..73e66a406d99 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -313,12 +313,17 @@ void mptcp_subflow_reset(struct sock *ssk) struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct sock *sk = subflow->conn; + /* must hold: tcp_done() could drop last reference on parent */ + sock_hold(sk); + tcp_set_state(ssk, TCP_CLOSE); tcp_send_active_reset(ssk, GFP_ATOMIC); tcp_done(ssk); if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &mptcp_sk(sk)->flags) && schedule_work(&mptcp_sk(sk)->work)) - sock_hold(sk); + return; /* worker will put sk for us */ + + sock_put(sk); } static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) @@ -1167,6 +1172,30 @@ failed: return err; } +static void mptcp_attach_cgroup(struct sock *parent, struct sock *child) +{ +#ifdef CONFIG_SOCK_CGROUP_DATA + struct sock_cgroup_data *parent_skcd = &parent->sk_cgrp_data, + *child_skcd = &child->sk_cgrp_data; + + /* only the additional subflows created by kworkers have to be modified */ + if (cgroup_id(sock_cgroup_ptr(parent_skcd)) != + cgroup_id(sock_cgroup_ptr(child_skcd))) { +#ifdef CONFIG_MEMCG + struct mem_cgroup *memcg = parent->sk_memcg; + + mem_cgroup_sk_free(child); + if (memcg && css_tryget(&memcg->css)) + child->sk_memcg = memcg; +#endif /* CONFIG_MEMCG */ + + cgroup_sk_free(child_skcd); + *child_skcd = *parent_skcd; + cgroup_sk_clone(child_skcd); + } +#endif /* CONFIG_SOCK_CGROUP_DATA */ +} + int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock) { struct mptcp_subflow_context *subflow; @@ -1187,6 +1216,9 @@ int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock) lock_sock(sf->sk); + /* the newly created socket has to be in the same cgroup as its parent */ + mptcp_attach_cgroup(sk, sf->sk); + /* kernel sockets do not by default acquire net ref, but TCP timer * needs it. 
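/* The reference ordering mptcp_subflow_reset() adopts above, reduced to
 * a generic sketch (hypothetical ref_hold/ref_put/try_schedule, not
 * kernel APIs): pin the parent before tcp_done() can drop the child's
 * last reference to it, then either hand the pin to the worker or
 * release it ourselves.
 */
struct parent;
void ref_hold(struct parent *p);
void ref_put(struct parent *p);
void destroy_child(struct parent *p);
int try_schedule(struct parent *p);	/* 1 if work was newly queued */

void teardown(struct parent *p)
{
	ref_hold(p);		/* survive destroy_child() dropping refs */
	destroy_child(p);
	if (try_schedule(p))
		return;		/* the worker inherits our hold */
	ref_put(p);		/* work already pending: release ours */
}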
*/ diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index c0b8215ab3d4..54e086c65721 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -2137,7 +2137,7 @@ ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int if (cp->flags & IP_VS_CONN_F_ONE_PACKET) pkts = sysctl_sync_threshold(ipvs); else - pkts = atomic_add_return(1, &cp->in_pkts); + pkts = atomic_inc_return(&cp->in_pkts); if (ipvs->sync_state & IP_VS_STATE_MASTER) ip_vs_sync_conn(ipvs, cp, pkts); diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index 16b48064f715..9d43277b8b4f 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -615,7 +615,7 @@ static void ip_vs_sync_conn_v0(struct netns_ipvs *ipvs, struct ip_vs_conn *cp, cp = cp->control; if (cp) { if (cp->flags & IP_VS_CONN_F_TEMPLATE) - pkts = atomic_add_return(1, &cp->in_pkts); + pkts = atomic_inc_return(&cp->in_pkts); else pkts = sysctl_sync_threshold(ipvs); ip_vs_sync_conn(ipvs, cp, pkts); @@ -776,7 +776,7 @@ control: if (!cp) return; if (cp->flags & IP_VS_CONN_F_TEMPLATE) - pkts = atomic_add_return(1, &cp->in_pkts); + pkts = atomic_inc_return(&cp->in_pkts); else pkts = sysctl_sync_threshold(ipvs); goto sloop; diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 3d0fd33be018..84caf3316946 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -167,10 +167,14 @@ nla_put_failure: return -1; } -static int ctnetlink_dump_timeout(struct sk_buff *skb, const struct nf_conn *ct) +static int ctnetlink_dump_timeout(struct sk_buff *skb, const struct nf_conn *ct, + bool skip_zero) { long timeout = nf_ct_expires(ct) / HZ; + if (skip_zero && timeout == 0) + return 0; + if (nla_put_be32(skb, CTA_TIMEOUT, htonl(timeout))) goto nla_put_failure; return 0; @@ -179,7 +183,8 @@ nla_put_failure: return -1; } -static int ctnetlink_dump_protoinfo(struct sk_buff *skb, struct nf_conn *ct) +static int ctnetlink_dump_protoinfo(struct sk_buff *skb, struct nf_conn *ct, + bool destroy) { const struct nf_conntrack_l4proto *l4proto; struct nlattr *nest_proto; @@ -193,7 +198,7 @@ static int ctnetlink_dump_protoinfo(struct sk_buff *skb, struct nf_conn *ct) if (!nest_proto) goto nla_put_failure; - ret = l4proto->to_nlattr(skb, nest_proto, ct); + ret = l4proto->to_nlattr(skb, nest_proto, ct, destroy); nla_nest_end(skb, nest_proto); @@ -537,8 +542,8 @@ static int ctnetlink_dump_info(struct sk_buff *skb, struct nf_conn *ct) return -1; if (!test_bit(IPS_OFFLOAD_BIT, &ct->status) && - (ctnetlink_dump_timeout(skb, ct) < 0 || - ctnetlink_dump_protoinfo(skb, ct) < 0)) + (ctnetlink_dump_timeout(skb, ct, false) < 0 || + ctnetlink_dump_protoinfo(skb, ct, false) < 0)) return -1; return 0; @@ -780,15 +785,19 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item) goto nla_put_failure; if (events & (1 << IPCT_DESTROY)) { + if (ctnetlink_dump_timeout(skb, ct, true) < 0) + goto nla_put_failure; + if (ctnetlink_dump_acct(skb, ct, type) < 0 || - ctnetlink_dump_timestamp(skb, ct) < 0) + ctnetlink_dump_timestamp(skb, ct) < 0 || + ctnetlink_dump_protoinfo(skb, ct, true) < 0) goto nla_put_failure; } else { - if (ctnetlink_dump_timeout(skb, ct) < 0) + if (ctnetlink_dump_timeout(skb, ct, false) < 0) goto nla_put_failure; - if (events & (1 << IPCT_PROTOINFO) - && ctnetlink_dump_protoinfo(skb, ct) < 0) + if (events & (1 << IPCT_PROTOINFO) && + ctnetlink_dump_protoinfo(skb, ct, false) < 
0) goto nla_put_failure; if ((events & (1 << IPCT_HELPER) || nfct_help(ct)) @@ -2720,10 +2729,10 @@ static int __ctnetlink_glue_build(struct sk_buff *skb, struct nf_conn *ct) if (ctnetlink_dump_status(skb, ct) < 0) goto nla_put_failure; - if (ctnetlink_dump_timeout(skb, ct) < 0) + if (ctnetlink_dump_timeout(skb, ct, false) < 0) goto nla_put_failure; - if (ctnetlink_dump_protoinfo(skb, ct) < 0) + if (ctnetlink_dump_protoinfo(skb, ct, false) < 0) goto nla_put_failure; if (ctnetlink_dump_helpinfo(skb, ct) < 0) diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index b3f4a334f9d7..db7479db8512 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -589,7 +589,7 @@ static void dccp_print_conntrack(struct seq_file *s, struct nf_conn *ct) #if IS_ENABLED(CONFIG_NF_CT_NETLINK) static int dccp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, - struct nf_conn *ct) + struct nf_conn *ct, bool destroy) { struct nlattr *nest_parms; @@ -597,15 +597,22 @@ static int dccp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, nest_parms = nla_nest_start(skb, CTA_PROTOINFO_DCCP); if (!nest_parms) goto nla_put_failure; - if (nla_put_u8(skb, CTA_PROTOINFO_DCCP_STATE, ct->proto.dccp.state) || - nla_put_u8(skb, CTA_PROTOINFO_DCCP_ROLE, + if (nla_put_u8(skb, CTA_PROTOINFO_DCCP_STATE, ct->proto.dccp.state)) + goto nla_put_failure; + + if (destroy) + goto skip_state; + + if (nla_put_u8(skb, CTA_PROTOINFO_DCCP_ROLE, ct->proto.dccp.role[IP_CT_DIR_ORIGINAL]) || nla_put_be64(skb, CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ, cpu_to_be64(ct->proto.dccp.handshake_seq), CTA_PROTOINFO_DCCP_PAD)) goto nla_put_failure; +skip_state: nla_nest_end(skb, nest_parms); spin_unlock_bh(&ct->lock); + return 0; nla_put_failure: diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index 810cca24b399..fb8dc02e502f 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -543,7 +543,7 @@ static bool sctp_can_early_drop(const struct nf_conn *ct) #include <linux/netfilter/nfnetlink_conntrack.h> static int sctp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, - struct nf_conn *ct) + struct nf_conn *ct, bool destroy) { struct nlattr *nest_parms; @@ -552,15 +552,20 @@ static int sctp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, if (!nest_parms) goto nla_put_failure; - if (nla_put_u8(skb, CTA_PROTOINFO_SCTP_STATE, ct->proto.sctp.state) || - nla_put_be32(skb, CTA_PROTOINFO_SCTP_VTAG_ORIGINAL, + if (nla_put_u8(skb, CTA_PROTOINFO_SCTP_STATE, ct->proto.sctp.state)) + goto nla_put_failure; + + if (destroy) + goto skip_state; + + if (nla_put_be32(skb, CTA_PROTOINFO_SCTP_VTAG_ORIGINAL, ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL]) || nla_put_be32(skb, CTA_PROTOINFO_SCTP_VTAG_REPLY, ct->proto.sctp.vtag[IP_CT_DIR_REPLY])) goto nla_put_failure; +skip_state: spin_unlock_bh(&ct->lock); - nla_nest_end(skb, nest_parms); return 0; diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 811c6c9b59e1..1d7e1c595546 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -1186,7 +1186,7 @@ static bool tcp_can_early_drop(const struct nf_conn *ct) #include <linux/netfilter/nfnetlink_conntrack.h> static int tcp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, - struct nf_conn *ct) + struct nf_conn *ct, bool destroy) { struct nlattr *nest_parms; struct nf_ct_tcp_flags tmp = {}; @@ -1196,8 +1196,13 @@ static int 
tcp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, if (!nest_parms) goto nla_put_failure; - if (nla_put_u8(skb, CTA_PROTOINFO_TCP_STATE, ct->proto.tcp.state) || - nla_put_u8(skb, CTA_PROTOINFO_TCP_WSCALE_ORIGINAL, + if (nla_put_u8(skb, CTA_PROTOINFO_TCP_STATE, ct->proto.tcp.state)) + goto nla_put_failure; + + if (destroy) + goto skip_state; + + if (nla_put_u8(skb, CTA_PROTOINFO_TCP_WSCALE_ORIGINAL, ct->proto.tcp.seen[0].td_scale) || nla_put_u8(skb, CTA_PROTOINFO_TCP_WSCALE_REPLY, ct->proto.tcp.seen[1].td_scale)) @@ -1212,8 +1217,8 @@ static int tcp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, if (nla_put(skb, CTA_PROTOINFO_TCP_FLAGS_REPLY, sizeof(struct nf_ct_tcp_flags), &tmp)) goto nla_put_failure; +skip_state: spin_unlock_bh(&ct->lock); - nla_nest_end(skb, nest_parms); return 0; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 9d6c317878ff..8d5aa0ac45f4 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3571,6 +3571,7 @@ static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = { [NFTA_SET_OBJ_TYPE] = { .type = NLA_U32 }, [NFTA_SET_HANDLE] = { .type = NLA_U64 }, [NFTA_SET_EXPR] = { .type = NLA_NESTED }, + [NFTA_SET_EXPRESSIONS] = { .type = NLA_NESTED }, }; static const struct nla_policy nft_set_desc_policy[NFTA_SET_DESC_MAX + 1] = { @@ -3778,6 +3779,7 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, u32 portid = ctx->portid; struct nlattr *nest; u32 seq = ctx->seq; + int i; event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), @@ -3846,11 +3848,22 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, nla_nest_end(skb, nest); - if (set->expr) { + if (set->num_exprs == 1) { nest = nla_nest_start_noflag(skb, NFTA_SET_EXPR); - if (nf_tables_fill_expr_info(skb, set->expr) < 0) + if (nf_tables_fill_expr_info(skb, set->exprs[0]) < 0) + goto nla_put_failure; + + nla_nest_end(skb, nest); + } else if (set->num_exprs > 1) { + nest = nla_nest_start_noflag(skb, NFTA_SET_EXPRESSIONS); + if (nest == NULL) goto nla_put_failure; + for (i = 0; i < set->num_exprs; i++) { + if (nft_expr_dump(skb, NFTA_LIST_ELEM, + set->exprs[i]) < 0) + goto nla_put_failure; + } nla_nest_end(skb, nest); } @@ -4220,7 +4233,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, return err; } - if (nla[NFTA_SET_EXPR]) + if (nla[NFTA_SET_EXPR] || nla[NFTA_SET_EXPRESSIONS]) desc.expr = true; table = nft_table_lookup(net, nla[NFTA_SET_TABLE], family, genmask); @@ -4284,6 +4297,31 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, err = PTR_ERR(expr); goto err_set_alloc_name; } + set->exprs[0] = expr; + set->num_exprs++; + } else if (nla[NFTA_SET_EXPRESSIONS]) { + struct nft_expr *expr; + struct nlattr *tmp; + int left; + + i = 0; + nla_for_each_nested(tmp, nla[NFTA_SET_EXPRESSIONS], left) { + if (i == NFT_SET_EXPR_MAX) { + err = -E2BIG; + goto err_set_init; + } + if (nla_type(tmp) != NFTA_LIST_ELEM) { + err = -EINVAL; + goto err_set_init; + } + expr = nft_set_elem_expr_alloc(&ctx, set, tmp); + if (IS_ERR(expr)) { + err = PTR_ERR(expr); + goto err_set_init; + } + set->exprs[i++] = expr; + set->num_exprs++; + } } udata = NULL; @@ -4301,7 +4339,6 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, set->dtype = dtype; set->objtype = objtype; set->dlen = desc.dlen; - set->expr = expr; set->flags = flags; set->size = desc.size; set->policy = policy; @@ -4330,8 +4367,8 @@ static int 
nf_tables_newset(struct net *net, struct sock *nlsk, err_set_trans: ops->destroy(set); err_set_init: - if (expr) - nft_expr_destroy(&ctx, expr); + for (i = 0; i < set->num_exprs; i++) + nft_expr_destroy(&ctx, set->exprs[i]); err_set_alloc_name: kfree(set->name); err_set_name: @@ -4341,11 +4378,13 @@ err_set_name: static void nft_set_destroy(const struct nft_ctx *ctx, struct nft_set *set) { + int i; + if (WARN_ON(set->use > 0)) return; - if (set->expr) - nft_expr_destroy(ctx, set->expr); + for (i = 0; i < set->num_exprs; i++) + nft_expr_destroy(ctx, set->exprs[i]); set->ops->destroy(set); kfree(set->name); @@ -4498,8 +4537,8 @@ const struct nft_set_ext_type nft_set_ext_types[] = { [NFT_SET_EXT_DATA] = { .align = __alignof__(u32), }, - [NFT_SET_EXT_EXPR] = { - .align = __alignof__(struct nft_expr), + [NFT_SET_EXT_EXPRESSIONS] = { + .align = __alignof__(struct nft_set_elem_expr), }, [NFT_SET_EXT_OBJREF] = { .len = sizeof(struct nft_object *), @@ -4542,6 +4581,7 @@ static const struct nla_policy nft_set_elem_policy[NFTA_SET_ELEM_MAX + 1] = { [NFTA_SET_ELEM_OBJREF] = { .type = NLA_STRING, .len = NFT_OBJ_MAXNAMELEN - 1 }, [NFTA_SET_ELEM_KEY_END] = { .type = NLA_NESTED }, + [NFTA_SET_ELEM_EXPRESSIONS] = { .type = NLA_NESTED }, }; static const struct nla_policy nft_set_elem_list_policy[NFTA_SET_ELEM_LIST_MAX + 1] = { @@ -4575,6 +4615,43 @@ static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx, struct net *net, return 0; } +static int nft_set_elem_expr_dump(struct sk_buff *skb, + const struct nft_set *set, + const struct nft_set_ext *ext) +{ + struct nft_set_elem_expr *elem_expr; + u32 size, num_exprs = 0; + struct nft_expr *expr; + struct nlattr *nest; + + elem_expr = nft_set_ext_expr(ext); + nft_setelem_expr_foreach(expr, elem_expr, size) + num_exprs++; + + if (num_exprs == 1) { + expr = nft_setelem_expr_at(elem_expr, 0); + if (nft_expr_dump(skb, NFTA_SET_ELEM_EXPR, expr) < 0) + return -1; + + return 0; + } else if (num_exprs > 1) { + nest = nla_nest_start_noflag(skb, NFTA_SET_ELEM_EXPRESSIONS); + if (nest == NULL) + goto nla_put_failure; + + nft_setelem_expr_foreach(expr, elem_expr, size) { + expr = nft_setelem_expr_at(elem_expr, size); + if (nft_expr_dump(skb, NFTA_LIST_ELEM, expr) < 0) + goto nla_put_failure; + } + nla_nest_end(skb, nest); + } + return 0; + +nla_put_failure: + return -1; +} + static int nf_tables_fill_setelem(struct sk_buff *skb, const struct nft_set *set, const struct nft_set_elem *elem) @@ -4602,8 +4679,8 @@ static int nf_tables_fill_setelem(struct sk_buff *skb, set->dlen) < 0) goto nla_put_failure; - if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPR) && - nft_expr_dump(skb, NFTA_SET_ELEM_EXPR, nft_set_ext_expr(ext)) < 0) + if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPRESSIONS) && + nft_set_elem_expr_dump(skb, set, ext)) goto nla_put_failure; if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF) && @@ -5098,8 +5175,8 @@ void *nft_set_elem_init(const struct nft_set *set, return elem; } -static void nft_set_elem_expr_destroy(const struct nft_ctx *ctx, - struct nft_expr *expr) +static void __nft_set_elem_expr_destroy(const struct nft_ctx *ctx, + struct nft_expr *expr) { if (expr->ops->destroy_clone) { expr->ops->destroy_clone(ctx, expr); @@ -5109,6 +5186,16 @@ static void nft_set_elem_expr_destroy(const struct nft_ctx *ctx, } } +static void nft_set_elem_expr_destroy(const struct nft_ctx *ctx, + struct nft_set_elem_expr *elem_expr) +{ + struct nft_expr *expr; + u32 size; + + nft_setelem_expr_foreach(expr, elem_expr, size) + __nft_set_elem_expr_destroy(ctx, expr); +} + void 
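/* Element expressions now live back-to-back in one variable-sized blob
 * hanging off the set element extension; the dump and destroy paths
 * above walk it by each expression's ops->size. A standalone sketch of
 * the container and the walker - this mirrors the idea, not the
 * kernel's exact definitions:
 */
struct expr_ops { unsigned char size; };
struct expr { const struct expr_ops *ops; };

struct elem_expr_blob {
	unsigned char size;	/* bytes used in data[] */
	unsigned char data[];	/* expr 0 | expr 1 | ... packed */
};

#define for_each_packed_expr(e, blob, off)				\
	for ((off) = 0, (e) = (struct expr *)&(blob)->data[0];		\
	     (off) < (blob)->size;					\
	     (off) += (e)->ops->size,					\
	     (e) = (struct expr *)&(blob)->data[(off)])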
nft_set_elem_destroy(const struct nft_set *set, void *elem, bool destroy_expr) { @@ -5121,7 +5208,7 @@ void nft_set_elem_destroy(const struct nft_set *set, void *elem, nft_data_release(nft_set_ext_key(ext), NFT_DATA_VALUE); if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA)) nft_data_release(nft_set_ext_data(ext), set->dtype); - if (destroy_expr && nft_set_ext_exists(ext, NFT_SET_EXT_EXPR)) + if (destroy_expr && nft_set_ext_exists(ext, NFT_SET_EXT_EXPRESSIONS)) nft_set_elem_expr_destroy(&ctx, nft_set_ext_expr(ext)); if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF)) @@ -5138,15 +5225,57 @@ static void nf_tables_set_elem_destroy(const struct nft_ctx *ctx, { struct nft_set_ext *ext = nft_set_elem_ext(set, elem); - if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPR)) + if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPRESSIONS)) nft_set_elem_expr_destroy(ctx, nft_set_ext_expr(ext)); kfree(elem); } +static int nft_set_elem_expr_clone(const struct nft_ctx *ctx, + struct nft_set *set, + struct nft_expr *expr_array[]) +{ + struct nft_expr *expr; + int err, i, k; + + for (i = 0; i < set->num_exprs; i++) { + expr = kzalloc(set->exprs[i]->ops->size, GFP_KERNEL); + if (!expr) + goto err_expr; + + err = nft_expr_clone(expr, set->exprs[i]); + if (err < 0) { + nft_expr_destroy(ctx, expr); + goto err_expr; + } + expr_array[i] = expr; + } + + return 0; + +err_expr: + for (k = i - 1; k >= 0; k--) + nft_expr_destroy(ctx, expr_array[k]); + + return -ENOMEM; +} + +static void nft_set_elem_expr_setup(const struct nft_set_ext *ext, int i, + struct nft_expr *expr_array[]) +{ + struct nft_set_elem_expr *elem_expr = nft_set_ext_expr(ext); + struct nft_expr *expr = nft_setelem_expr_at(elem_expr, elem_expr->size); + + memcpy(expr, expr_array[i], expr_array[i]->ops->size); + elem_expr->size += expr_array[i]->ops->size; + kfree(expr_array[i]); + expr_array[i] = NULL; +} + static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, const struct nlattr *attr, u32 nlmsg_flags) { + struct nft_expr *expr_array[NFT_SET_EXPR_MAX] = {}; struct nlattr *nla[NFTA_SET_ELEM_MAX + 1]; u8 genmask = nft_genmask_next(ctx->net); struct nft_set_ext_tmpl tmpl; @@ -5154,16 +5283,15 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, struct nft_set_elem elem; struct nft_set_binding *binding; struct nft_object *obj = NULL; - struct nft_expr *expr = NULL; struct nft_userdata *udata; struct nft_data_desc desc; enum nft_registers dreg; struct nft_trans *trans; - u32 flags = 0; + u32 flags = 0, size = 0; u64 timeout; u64 expiration; + int err, i; u8 ulen; - int err; err = nla_parse_nested_deprecated(nla, NFTA_SET_ELEM_MAX, attr, nft_set_elem_policy, NULL); @@ -5196,7 +5324,8 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, nla[NFTA_SET_ELEM_TIMEOUT] || nla[NFTA_SET_ELEM_EXPIRATION] || nla[NFTA_SET_ELEM_USERDATA] || - nla[NFTA_SET_ELEM_EXPR])) + nla[NFTA_SET_ELEM_EXPR] || + nla[NFTA_SET_ELEM_EXPRESSIONS])) return -EINVAL; timeout = 0; @@ -5221,23 +5350,62 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, return err; } - if (nla[NFTA_SET_ELEM_EXPR] != NULL) { + if (nla[NFTA_SET_ELEM_EXPR]) { + struct nft_expr *expr; + + if (set->num_exprs != 1) + return -EOPNOTSUPP; + expr = nft_set_elem_expr_alloc(ctx, set, nla[NFTA_SET_ELEM_EXPR]); if (IS_ERR(expr)) return PTR_ERR(expr); - err = -EOPNOTSUPP; - if (set->expr && set->expr->ops != expr->ops) + expr_array[0] = expr; + + if (set->exprs[0] && set->exprs[0]->ops != expr->ops) { + err = -EOPNOTSUPP; goto err_set_elem_expr; - } else if
(set->expr) { - expr = kzalloc(set->expr->ops->size, GFP_KERNEL); - if (!expr) - return -ENOMEM; + } + } else if (nla[NFTA_SET_ELEM_EXPRESSIONS]) { + struct nft_expr *expr; + struct nlattr *tmp; + int left; - err = nft_expr_clone(expr, set->expr); - if (err < 0) + if (set->num_exprs == 0) + return -EOPNOTSUPP; + + i = 0; + nla_for_each_nested(tmp, nla[NFTA_SET_ELEM_EXPRESSIONS], left) { + if (i == set->num_exprs) { + err = -E2BIG; + goto err_set_elem_expr; + } + if (nla_type(tmp) != NFTA_LIST_ELEM) { + err = -EINVAL; + goto err_set_elem_expr; + } + expr = nft_set_elem_expr_alloc(ctx, set, tmp); + if (IS_ERR(expr)) { + err = PTR_ERR(expr); + goto err_set_elem_expr; + } + expr_array[i] = expr; + + if (expr->ops != set->exprs[i]->ops) { + err = -EOPNOTSUPP; + goto err_set_elem_expr; + } + i++; + } + if (set->num_exprs != i) { + err = -EOPNOTSUPP; goto err_set_elem_expr; + } + } else if (set->num_exprs > 0) { + err = nft_set_elem_expr_clone(ctx, set, expr_array); + if (err < 0) + goto err_set_elem_expr_clone; } err = nft_setelem_parse_key(ctx, set, &elem.key.val, @@ -5262,9 +5430,14 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, nft_set_ext_add(&tmpl, NFT_SET_EXT_TIMEOUT); } - if (expr) - nft_set_ext_add_length(&tmpl, NFT_SET_EXT_EXPR, - expr->ops->size); + if (set->num_exprs) { + for (i = 0; i < set->num_exprs; i++) + size += expr_array[i]->ops->size; + + nft_set_ext_add_length(&tmpl, NFT_SET_EXT_EXPRESSIONS, + sizeof(struct nft_set_elem_expr) + + size); + } if (nla[NFTA_SET_ELEM_OBJREF] != NULL) { if (!(set->flags & NFT_SET_OBJECT)) { @@ -5346,11 +5519,8 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, *nft_set_ext_obj(ext) = obj; obj->use++; } - if (expr) { - memcpy(nft_set_ext_expr(ext), expr, expr->ops->size); - kfree(expr); - expr = NULL; - } + for (i = 0; i < set->num_exprs; i++) + nft_set_elem_expr_setup(ext, i, expr_array); trans = nft_trans_elem_alloc(ctx, NFT_MSG_NEWSETELEM, set); if (trans == NULL) @@ -5411,9 +5581,9 @@ err_parse_key_end: err_parse_key: nft_data_release(&elem.key.val, NFT_DATA_VALUE); err_set_elem_expr: - if (expr != NULL) - nft_expr_destroy(ctx, expr); - + for (i = 0; i < set->num_exprs && expr_array[i]; i++) + nft_expr_destroy(ctx, expr_array[i]); +err_set_elem_expr_clone: return err; } diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c index 5e511df8d709..0fa1653b5f19 100644 --- a/net/netfilter/nfnetlink_acct.c +++ b/net/netfilter/nfnetlink_acct.c @@ -16,6 +16,7 @@ #include <linux/errno.h> #include <net/netlink.h> #include <net/sock.h> +#include <net/netns/generic.h> #include <linux/netfilter.h> #include <linux/netfilter/nfnetlink.h> @@ -41,6 +42,17 @@ struct nfacct_filter { u32 mask; }; +struct nfnl_acct_net { + struct list_head nfnl_acct_list; +}; + +static unsigned int nfnl_acct_net_id __read_mostly; + +static inline struct nfnl_acct_net *nfnl_acct_pernet(struct net *net) +{ + return net_generic(net, nfnl_acct_net_id); +} + #define NFACCT_F_QUOTA (NFACCT_F_QUOTA_PKTS | NFACCT_F_QUOTA_BYTES) #define NFACCT_OVERQUOTA_BIT 2 /* NFACCT_F_OVERQUOTA */ @@ -49,6 +61,7 @@ static int nfnl_acct_new(struct net *net, struct sock *nfnl, const struct nlattr * const tb[], struct netlink_ext_ack *extack) { + struct nfnl_acct_net *nfnl_acct_net = nfnl_acct_pernet(net); struct nf_acct *nfacct, *matching = NULL; char *acct_name; unsigned int size = 0; @@ -61,7 +74,7 @@ static int nfnl_acct_new(struct net *net, struct sock *nfnl, if (strlen(acct_name) == 0) return -EINVAL; - list_for_each_entry(nfacct, 
&net->nfnl_acct_list, head) { + list_for_each_entry(nfacct, &nfnl_acct_net->nfnl_acct_list, head) { if (strncmp(nfacct->name, acct_name, NFACCT_NAME_MAX) != 0) continue; @@ -123,7 +136,7 @@ static int nfnl_acct_new(struct net *net, struct sock *nfnl, be64_to_cpu(nla_get_be64(tb[NFACCT_PKTS]))); } refcount_set(&nfacct->refcnt, 1); - list_add_tail_rcu(&nfacct->head, &net->nfnl_acct_list); + list_add_tail_rcu(&nfacct->head, &nfnl_acct_net->nfnl_acct_list); return 0; } @@ -188,6 +201,7 @@ static int nfnl_acct_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); + struct nfnl_acct_net *nfnl_acct_net = nfnl_acct_pernet(net); struct nf_acct *cur, *last; const struct nfacct_filter *filter = cb->data; @@ -199,7 +213,7 @@ nfnl_acct_dump(struct sk_buff *skb, struct netlink_callback *cb) cb->args[1] = 0; rcu_read_lock(); - list_for_each_entry_rcu(cur, &net->nfnl_acct_list, head) { + list_for_each_entry_rcu(cur, &nfnl_acct_net->nfnl_acct_list, head) { if (last) { if (cur != last) continue; @@ -269,6 +283,7 @@ static int nfnl_acct_get(struct net *net, struct sock *nfnl, const struct nlattr * const tb[], struct netlink_ext_ack *extack) { + struct nfnl_acct_net *nfnl_acct_net = nfnl_acct_pernet(net); int ret = -ENOENT; struct nf_acct *cur; char *acct_name; @@ -288,7 +303,7 @@ static int nfnl_acct_get(struct net *net, struct sock *nfnl, return -EINVAL; acct_name = nla_data(tb[NFACCT_NAME]); - list_for_each_entry(cur, &net->nfnl_acct_list, head) { + list_for_each_entry(cur, &nfnl_acct_net->nfnl_acct_list, head) { struct sk_buff *skb2; if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX)!= 0) @@ -342,19 +357,20 @@ static int nfnl_acct_del(struct net *net, struct sock *nfnl, const struct nlattr * const tb[], struct netlink_ext_ack *extack) { + struct nfnl_acct_net *nfnl_acct_net = nfnl_acct_pernet(net); struct nf_acct *cur, *tmp; int ret = -ENOENT; char *acct_name; if (!tb[NFACCT_NAME]) { - list_for_each_entry_safe(cur, tmp, &net->nfnl_acct_list, head) + list_for_each_entry_safe(cur, tmp, &nfnl_acct_net->nfnl_acct_list, head) nfnl_acct_try_del(cur); return 0; } acct_name = nla_data(tb[NFACCT_NAME]); - list_for_each_entry(cur, &net->nfnl_acct_list, head) { + list_for_each_entry(cur, &nfnl_acct_net->nfnl_acct_list, head) { if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX) != 0) continue; @@ -402,10 +418,11 @@ MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_ACCT); struct nf_acct *nfnl_acct_find_get(struct net *net, const char *acct_name) { + struct nfnl_acct_net *nfnl_acct_net = nfnl_acct_pernet(net); struct nf_acct *cur, *acct = NULL; rcu_read_lock(); - list_for_each_entry_rcu(cur, &net->nfnl_acct_list, head) { + list_for_each_entry_rcu(cur, &nfnl_acct_net->nfnl_acct_list, head) { if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX)!= 0) continue; @@ -488,16 +505,17 @@ EXPORT_SYMBOL_GPL(nfnl_acct_overquota); static int __net_init nfnl_acct_net_init(struct net *net) { - INIT_LIST_HEAD(&net->nfnl_acct_list); + INIT_LIST_HEAD(&nfnl_acct_pernet(net)->nfnl_acct_list); return 0; } static void __net_exit nfnl_acct_net_exit(struct net *net) { + struct nfnl_acct_net *nfnl_acct_net = nfnl_acct_pernet(net); struct nf_acct *cur, *tmp; - list_for_each_entry_safe(cur, tmp, &net->nfnl_acct_list, head) { + list_for_each_entry_safe(cur, tmp, &nfnl_acct_net->nfnl_acct_list, head) { list_del_rcu(&cur->head); if (refcount_dec_and_test(&cur->refcnt)) @@ -508,6 +526,8 @@ static void __net_exit nfnl_acct_net_exit(struct net *net) static struct pernet_operations nfnl_acct_ops = { .init = nfnl_acct_net_init, 
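/* The per-netns conversion pattern applied to nfnl_acct above, as a
 * generic template (hypothetical "foo" subsystem): registering the id
 * and size makes the core allocate one struct foo_net per namespace,
 * so nothing needs to live in struct net itself any more.
 */
#include <net/net_namespace.h>
#include <net/netns/generic.h>

struct foo_net {
	struct list_head items;
};

static unsigned int foo_net_id __read_mostly;

static struct foo_net *foo_pernet(struct net *net)
{
	return net_generic(net, foo_net_id);
}

static int __net_init foo_net_init(struct net *net)
{
	INIT_LIST_HEAD(&foo_pernet(net)->items);
	return 0;
}

static struct pernet_operations foo_net_ops = {
	.init = foo_net_init,
	.id   = &foo_net_id,
	.size = sizeof(struct foo_net),
};
/* module init would call register_pernet_subsys(&foo_net_ops) */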
.exit = nfnl_acct_net_exit, + .id = &nfnl_acct_net_id, + .size = sizeof(struct nfnl_acct_net), }; static int __init nfnl_acct_init(void) diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 9af4f93c7f0e..983a1d5ca3ab 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -19,11 +19,30 @@ struct nft_dynset { enum nft_registers sreg_key:8; enum nft_registers sreg_data:8; bool invert; + u8 num_exprs; u64 timeout; - struct nft_expr *expr; + struct nft_expr *expr_array[NFT_SET_EXPR_MAX]; struct nft_set_binding binding; }; +static int nft_dynset_expr_setup(const struct nft_dynset *priv, + const struct nft_set_ext *ext) +{ + struct nft_set_elem_expr *elem_expr = nft_set_ext_expr(ext); + struct nft_expr *expr; + int i; + + for (i = 0; i < priv->num_exprs; i++) { + expr = nft_setelem_expr_at(elem_expr, elem_expr->size); + if (nft_expr_clone(expr, priv->expr_array[i]) < 0) + return -1; + + elem_expr->size += priv->expr_array[i]->ops->size; + } + + return 0; +} + static void *nft_dynset_new(struct nft_set *set, const struct nft_expr *expr, struct nft_regs *regs) { @@ -44,8 +63,7 @@ static void *nft_dynset_new(struct nft_set *set, const struct nft_expr *expr, goto err1; ext = nft_set_elem_ext(set, elem); - if (priv->expr != NULL && - nft_expr_clone(nft_set_ext_expr(ext), priv->expr) < 0) + if (priv->num_exprs && nft_dynset_expr_setup(priv, ext) < 0) goto err2; return elem; @@ -90,6 +108,41 @@ void nft_dynset_eval(const struct nft_expr *expr, regs->verdict.code = NFT_BREAK; } +static void nft_dynset_ext_add_expr(struct nft_dynset *priv) +{ + u8 size = 0; + int i; + + for (i = 0; i < priv->num_exprs; i++) + size += priv->expr_array[i]->ops->size; + + nft_set_ext_add_length(&priv->tmpl, NFT_SET_EXT_EXPRESSIONS, + sizeof(struct nft_set_elem_expr) + size); +} + +static struct nft_expr * +nft_dynset_expr_alloc(const struct nft_ctx *ctx, const struct nft_set *set, + const struct nlattr *attr, int pos) +{ + struct nft_expr *expr; + int err; + + expr = nft_set_elem_expr_alloc(ctx, set, attr); + if (IS_ERR(expr)) + return expr; + + if (set->exprs[pos] && set->exprs[pos]->ops != expr->ops) { + err = -EOPNOTSUPP; + goto err_dynset_expr; + } + + return expr; + +err_dynset_expr: + nft_expr_destroy(ctx, expr); + return ERR_PTR(err); +} + static const struct nla_policy nft_dynset_policy[NFTA_DYNSET_MAX + 1] = { [NFTA_DYNSET_SET_NAME] = { .type = NLA_STRING, .len = NFT_SET_MAXNAMELEN - 1 }, @@ -100,6 +153,7 @@ static const struct nla_policy nft_dynset_policy[NFTA_DYNSET_MAX + 1] = { [NFTA_DYNSET_TIMEOUT] = { .type = NLA_U64 }, [NFTA_DYNSET_EXPR] = { .type = NLA_NESTED }, [NFTA_DYNSET_FLAGS] = { .type = NLA_U32 }, + [NFTA_DYNSET_EXPRESSIONS] = { .type = NLA_NESTED }, }; static int nft_dynset_init(const struct nft_ctx *ctx, @@ -110,7 +164,7 @@ static int nft_dynset_init(const struct nft_ctx *ctx, u8 genmask = nft_genmask_next(ctx->net); struct nft_set *set; u64 timeout; - int err; + int err, i; lockdep_assert_held(&ctx->net->nft.commit_mutex); @@ -181,16 +235,58 @@ static int nft_dynset_init(const struct nft_ctx *ctx, } else if (set->flags & NFT_SET_MAP) return -EINVAL; - if (tb[NFTA_DYNSET_EXPR] != NULL) { - if (!(set->flags & NFT_SET_EVAL)) - return -EINVAL; + if ((tb[NFTA_DYNSET_EXPR] || tb[NFTA_DYNSET_EXPRESSIONS]) && + !(set->flags & NFT_SET_EVAL)) + return -EINVAL; + + if (tb[NFTA_DYNSET_EXPR]) { + struct nft_expr *dynset_expr; + + dynset_expr = nft_dynset_expr_alloc(ctx, set, + tb[NFTA_DYNSET_EXPR], 0); + if (IS_ERR(dynset_expr)) + return PTR_ERR(dynset_expr); - 
priv->expr = nft_set_elem_expr_alloc(ctx, set, - tb[NFTA_DYNSET_EXPR]); - if (IS_ERR(priv->expr)) - return PTR_ERR(priv->expr); + priv->num_exprs++; + priv->expr_array[0] = dynset_expr; - if (set->expr && set->expr->ops != priv->expr->ops) { + if (set->num_exprs > 1 || + (set->num_exprs == 1 && + dynset_expr->ops != set->exprs[0]->ops)) { + err = -EOPNOTSUPP; + goto err_expr_free; + } + } else if (tb[NFTA_DYNSET_EXPRESSIONS]) { + struct nft_expr *dynset_expr; + struct nlattr *tmp; + int left; + + i = 0; + nla_for_each_nested(tmp, tb[NFTA_DYNSET_EXPRESSIONS], left) { + if (i == NFT_SET_EXPR_MAX) { + err = -E2BIG; + goto err_expr_free; + } + if (nla_type(tmp) != NFTA_LIST_ELEM) { + err = -EINVAL; + goto err_expr_free; + } + dynset_expr = nft_dynset_expr_alloc(ctx, set, tmp, i); + if (IS_ERR(dynset_expr)) { + err = PTR_ERR(dynset_expr); + goto err_expr_free; + } + priv->expr_array[i] = dynset_expr; + priv->num_exprs++; + + if (set->num_exprs && + dynset_expr->ops != set->exprs[i]->ops) { + err = -EOPNOTSUPP; + goto err_expr_free; + } + i++; + } + if (set->num_exprs && set->num_exprs != i) { err = -EOPNOTSUPP; goto err_expr_free; } @@ -200,9 +296,10 @@ static int nft_dynset_init(const struct nft_ctx *ctx, nft_set_ext_add_length(&priv->tmpl, NFT_SET_EXT_KEY, set->klen); if (set->flags & NFT_SET_MAP) nft_set_ext_add_length(&priv->tmpl, NFT_SET_EXT_DATA, set->dlen); - if (priv->expr != NULL) - nft_set_ext_add_length(&priv->tmpl, NFT_SET_EXT_EXPR, - priv->expr->ops->size); + + if (priv->num_exprs) + nft_dynset_ext_add_expr(priv); + if (set->flags & NFT_SET_TIMEOUT) { if (timeout || set->timeout) nft_set_ext_add(&priv->tmpl, NFT_SET_EXT_EXPIRATION); @@ -221,8 +318,8 @@ static int nft_dynset_init(const struct nft_ctx *ctx, return 0; err_expr_free: - if (priv->expr != NULL) - nft_expr_destroy(ctx, priv->expr); + for (i = 0; i < priv->num_exprs; i++) + nft_expr_destroy(ctx, priv->expr_array[i]); return err; } @@ -247,9 +344,10 @@ static void nft_dynset_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { struct nft_dynset *priv = nft_expr_priv(expr); + int i; - if (priv->expr != NULL) - nft_expr_destroy(ctx, priv->expr); + for (i = 0; i < priv->num_exprs; i++) + nft_expr_destroy(ctx, priv->expr_array[i]); nf_tables_destroy_set(ctx, priv->set); } @@ -258,6 +356,7 @@ static int nft_dynset_dump(struct sk_buff *skb, const struct nft_expr *expr) { const struct nft_dynset *priv = nft_expr_priv(expr); u32 flags = priv->invert ? 
NFT_DYNSET_F_INV : 0; + int i; if (nft_dump_register(skb, NFTA_DYNSET_SREG_KEY, priv->sreg_key)) goto nla_put_failure; @@ -272,8 +371,23 @@ static int nft_dynset_dump(struct sk_buff *skb, const struct nft_expr *expr) nf_jiffies64_to_msecs(priv->timeout), NFTA_DYNSET_PAD)) goto nla_put_failure; - if (priv->expr && nft_expr_dump(skb, NFTA_DYNSET_EXPR, priv->expr)) - goto nla_put_failure; + if (priv->num_exprs == 1) { + if (nft_expr_dump(skb, NFTA_DYNSET_EXPR, priv->expr_array[0])) + goto nla_put_failure; + } else if (priv->num_exprs > 1) { + struct nlattr *nest; + + nest = nla_nest_start_noflag(skb, NFTA_DYNSET_EXPRESSIONS); + if (!nest) + goto nla_put_failure; + + for (i = 0; i < priv->num_exprs; i++) { + if (nft_expr_dump(skb, NFTA_LIST_ELEM, + priv->expr_array[i])) + goto nla_put_failure; + } + nla_nest_end(skb, nest); + } if (nla_put_be32(skb, NFTA_DYNSET_FLAGS, htonl(flags))) goto nla_put_failure; return 0; diff --git a/net/netfilter/nft_reject_inet.c b/net/netfilter/nft_reject_inet.c index 32f3ea398ddf..95090186ee90 100644 --- a/net/netfilter/nft_reject_inet.c +++ b/net/netfilter/nft_reject_inet.c @@ -28,7 +28,8 @@ static void nft_reject_inet_eval(const struct nft_expr *expr, nft_hook(pkt)); break; case NFT_REJECT_TCP_RST: - nf_send_reset(nft_net(pkt), pkt->skb, nft_hook(pkt)); + nf_send_reset(nft_net(pkt), pkt->xt.state->sk, + pkt->skb, nft_hook(pkt)); break; case NFT_REJECT_ICMPX_UNREACH: nf_send_unreach(pkt->skb, @@ -44,7 +45,8 @@ static void nft_reject_inet_eval(const struct nft_expr *expr, priv->icmp_code, nft_hook(pkt)); break; case NFT_REJECT_TCP_RST: - nf_send_reset6(nft_net(pkt), pkt->skb, nft_hook(pkt)); + nf_send_reset6(nft_net(pkt), pkt->xt.state->sk, + pkt->skb, nft_hook(pkt)); break; case NFT_REJECT_ICMPX_UNREACH: nf_send_unreach6(nft_net(pkt), pkt->skb, diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 4d3f147e8d8d..bf618b7ec1ae 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -293,6 +293,22 @@ cont: rhashtable_walk_exit(&hti); } +static bool nft_rhash_expr_needs_gc_run(const struct nft_set *set, + struct nft_set_ext *ext) +{ + struct nft_set_elem_expr *elem_expr = nft_set_ext_expr(ext); + struct nft_expr *expr; + u32 size; + + nft_setelem_expr_foreach(expr, elem_expr, size) { + if (expr->ops->gc && + expr->ops->gc(read_pnet(&set->net), expr)) + return true; + } + + return false; +} + static void nft_rhash_gc(struct work_struct *work) { struct nft_set *set; @@ -314,16 +330,13 @@ static void nft_rhash_gc(struct work_struct *work) continue; } - if (nft_set_ext_exists(&he->ext, NFT_SET_EXT_EXPR)) { - struct nft_expr *expr = nft_set_ext_expr(&he->ext); + if (nft_set_ext_exists(&he->ext, NFT_SET_EXT_EXPRESSIONS) && + nft_rhash_expr_needs_gc_run(set, &he->ext)) + goto needs_gc_run; - if (expr->ops->gc && - expr->ops->gc(read_pnet(&set->net), expr)) - goto gc; - } if (!nft_set_elem_expired(&he->ext)) continue; -gc: +needs_gc_run: if (nft_set_elem_mark_busy(&he->ext)) continue; diff --git a/net/netfilter/xt_nfacct.c b/net/netfilter/xt_nfacct.c index a97c2259bbc8..7c6bf1c16813 100644 --- a/net/netfilter/xt_nfacct.c +++ b/net/netfilter/xt_nfacct.c @@ -27,7 +27,7 @@ static bool nfacct_mt(const struct sk_buff *skb, struct xt_action_param *par) overquota = nfnl_acct_overquota(xt_net(par), info->nfacct); - return overquota == NFACCT_UNDERQUOTA ? 
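/* The backwards-compatibility rule the dump paths in this series
 * follow: a single expression keeps the legacy flat attribute, two or
 * more go inside a nested list. A condensed sketch using the same
 * netlink primitives the patch uses (assumed available as above):
 */
static int dump_expr_list(struct sk_buff *skb, struct nft_expr *exprs[],
			  int num_exprs)
{
	struct nlattr *nest;
	int i;

	if (num_exprs == 1)	/* old userspace keeps working */
		return nft_expr_dump(skb, NFTA_DYNSET_EXPR, exprs[0]);

	nest = nla_nest_start_noflag(skb, NFTA_DYNSET_EXPRESSIONS);
	if (!nest)
		return -1;
	for (i = 0; i < num_exprs; i++) {
		if (nft_expr_dump(skb, NFTA_LIST_ELEM, exprs[i]) < 0)
			return -1;
	}
	nla_nest_end(skb, nest);
	return 0;
}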
false : true; + return overquota != NFACCT_UNDERQUOTA; } static int diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index c3a664871cb5..e8902a7e60f2 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -959,16 +959,13 @@ static int dec_ttl_exception_handler(struct datapath *dp, struct sk_buff *skb, struct sw_flow_key *key, const struct nlattr *attr, bool last) { - /* The first action is always 'OVS_DEC_TTL_ATTR_ARG'. */ - struct nlattr *dec_ttl_arg = nla_data(attr); + /* The first attribute is always 'OVS_DEC_TTL_ATTR_ACTION'. */ + struct nlattr *actions = nla_data(attr); - if (nla_len(dec_ttl_arg)) { - struct nlattr *actions = nla_data(dec_ttl_arg); + if (nla_len(actions)) + return clone_execute(dp, skb, key, 0, nla_data(actions), + nla_len(actions), last, false); - if (actions) - return clone_execute(dp, skb, key, 0, nla_data(actions), - nla_len(actions), last, false); - } consume_skb(skb); return 0; } @@ -1212,7 +1209,7 @@ static int execute_dec_ttl(struct sk_buff *skb, struct sw_flow_key *key) return -EHOSTUNREACH; key->ip.ttl = --nh->hop_limit; - } else { + } else if (skb->protocol == htons(ETH_P_IP)) { struct iphdr *nh; u8 old_ttl; diff --git a/net/socket.c b/net/socket.c index bfef11ba35b8..9a240b45bdf3 100644 --- a/net/socket.c +++ b/net/socket.c @@ -445,17 +445,15 @@ static int sock_map_fd(struct socket *sock, int flags) /** * sock_from_file - Return the &socket bounded to @file. * @file: file - * @err: pointer to an error code return * - * On failure returns %NULL and assigns -ENOTSOCK to @err. + * On failure returns %NULL. */ -struct socket *sock_from_file(struct file *file, int *err) +struct socket *sock_from_file(struct file *file) { if (file->f_op == &socket_file_ops) return file->private_data; /* set in sock_map_fd */ - *err = -ENOTSOCK; return NULL; } EXPORT_SYMBOL(sock_from_file); @@ -484,9 +482,11 @@ struct socket *sockfd_lookup(int fd, int *err) return NULL; } - sock = sock_from_file(file, err); - if (!sock) + sock = sock_from_file(file); + if (!sock) { + *err = -ENOTSOCK; fput(file); + } return sock; } EXPORT_SYMBOL(sockfd_lookup); @@ -498,11 +498,12 @@ static struct socket *sockfd_lookup_light(int fd, int *err, int *fput_needed) *err = -EBADF; if (f.file) { - sock = sock_from_file(f.file, err); + sock = sock_from_file(f.file); if (likely(sock)) { *fput_needed = f.flags & FDPUT_FPUT; return sock; } + *err = -ENOTSOCK; fdput(f); } return NULL; @@ -1693,9 +1694,11 @@ int __sys_accept4_file(struct file *file, unsigned file_flags, if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK)) flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK; - sock = sock_from_file(file, &err); - if (!sock) + sock = sock_from_file(file); + if (!sock) { + err = -ENOTSOCK; goto out; + } err = -ENFILE; newsock = sock_alloc(); @@ -1818,9 +1821,11 @@ int __sys_connect_file(struct file *file, struct sockaddr_storage *address, struct socket *sock; int err; - sock = sock_from_file(file, &err); - if (!sock) + sock = sock_from_file(file); + if (!sock) { + err = -ENOTSOCK; goto out; + } err = security_socket_connect(sock, (struct sockaddr *)address, addrlen); diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 310cfc68875a..ac4a317038f1 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -564,12 +564,12 @@ static int xsk_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int fl struct sock *sk = sock->sk; struct xdp_sock *xs = xdp_sk(sk); + if (unlikely(!xsk_is_bound(xs))) + return -ENXIO; if (unlikely(!(xs->dev->flags & IFF_UP))) return -ENETDOWN; if 
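/* Why the xsk_is_bound() test moves ahead of the IFF_UP test above:
 * xs->dev is only set once the socket is bound, so the old order could
 * dereference a NULL xs->dev. The general shape of the rule - validate
 * the precondition that establishes a pointer before touching fields
 * behind it - as a sketch with hypothetical types:
 */
struct dev { unsigned int flags; };
struct xsk { int bound; struct dev *dev; };

static int recv_checks(const struct xsk *xs)
{
	if (!xs->bound)			/* establishes xs->dev != NULL */
		return -1;		/* the -ENXIO case */
	if (!(xs->dev->flags & 0x1))	/* IFF_UP analogue */
		return -2;		/* the -ENETDOWN case */
	return 0;
}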
(unlikely(!xs->rx)) return -ENOBUFS; - if (unlikely(!xsk_is_bound(xs))) - return -ENXIO; if (unlikely(need_wait)) return -EOPNOTSUPP; diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c index 568f9815bb1b..db0cb73513a5 100644 --- a/samples/bpf/xdpsock_user.c +++ b/samples/bpf/xdpsock_user.c @@ -1275,6 +1275,8 @@ static void tx_only(struct xsk_socket_info *xsk, u32 *frame_nb, int batch_size) while (xsk_ring_prod__reserve(&xsk->tx, batch_size, &idx) < batch_size) { complete_tx_only(xsk, batch_size); + if (benchmark_done) + return; } for (i = 0; i < batch_size; i++) { diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py index 8b829748d488..867ada23281c 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_helpers_doc.py @@ -437,6 +437,8 @@ class PrinterHelpers(Printer): 'struct path', 'struct btf_ptr', 'struct inode', + 'struct socket', + 'struct file', ] known_types = { '...', @@ -482,6 +484,8 @@ class PrinterHelpers(Printer): 'struct path', 'struct btf_ptr', 'struct inode', + 'struct socket', + 'struct file', } mapped_types = { 'u8': '__u8', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 7e9931b6c735..77d7c1bb2923 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3822,6 +3822,14 @@ union bpf_attr { * The **hash_algo** is returned on success, * **-EOPNOTSUP** if IMA is disabled or **-EINVAL** if * invalid arguments are passed. + * + * struct socket *bpf_sock_from_file(struct file *file) + * Description + * If the given file represents a socket, returns the associated + * socket. + * Return + * A pointer to a struct socket on success or NULL if the file is + * not a socket. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3986,6 +3994,7 @@ union bpf_attr { FN(bprm_opts_set), \ FN(ktime_get_coarse_ns), \ FN(ima_inode_hash), \ + FN(sock_from_file), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 9be88a90a4aa..6ae748f6ea11 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -2518,7 +2518,7 @@ static int bpf_object__finalize_btf(struct bpf_object *obj) return 0; } -static inline bool libbpf_prog_needs_vmlinux_btf(struct bpf_program *prog) +static bool prog_needs_vmlinux_btf(struct bpf_program *prog) { if (prog->type == BPF_PROG_TYPE_STRUCT_OPS || prog->type == BPF_PROG_TYPE_LSM) @@ -2533,37 +2533,43 @@ static inline bool libbpf_prog_needs_vmlinux_btf(struct bpf_program *prog) return false; } -static int bpf_object__load_vmlinux_btf(struct bpf_object *obj) +static bool obj_needs_vmlinux_btf(const struct bpf_object *obj) { - bool need_vmlinux_btf = false; struct bpf_program *prog; - int i, err; + int i; /* CO-RE relocations need kernel BTF */ if (obj->btf_ext && obj->btf_ext->core_relo_info.len) - need_vmlinux_btf = true; + return true; /* Support for typed ksyms needs kernel BTF */ for (i = 0; i < obj->nr_extern; i++) { const struct extern_desc *ext; ext = &obj->externs[i]; - if (ext->type == EXT_KSYM && ext->ksym.type_id) { - need_vmlinux_btf = true; - break; - } + if (ext->type == EXT_KSYM && ext->ksym.type_id) + return true; } bpf_object__for_each_program(prog, obj) { if (!prog->load) continue; - if (libbpf_prog_needs_vmlinux_btf(prog)) { - need_vmlinux_btf = true; - break; - } + if (prog_needs_vmlinux_btf(prog)) + return true; } - if (!need_vmlinux_btf) + return false; +} + +static int bpf_object__load_vmlinux_btf(struct bpf_object *obj, bool force) +{ + int err; + 
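+	/* Callers pass force == true when kernel BTF is known to be
+	 * required even if the obj_needs_vmlinux_btf() heuristics above
+	 * would say otherwise, e.g. when resolving an attach target by name.
+	 */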
+ /* btf_vmlinux could be loaded earlier */ + if (obj->btf_vmlinux) + return 0; + + if (!force && !obj_needs_vmlinux_btf(obj)) return 0; obj->btf_vmlinux = libbpf_find_kernel_btf(); @@ -7475,7 +7481,7 @@ int bpf_object__load_xattr(struct bpf_object_load_attr *attr) } err = bpf_object__probe_loading(obj); - err = err ? : bpf_object__load_vmlinux_btf(obj); + err = err ? : bpf_object__load_vmlinux_btf(obj, false); err = err ? : bpf_object__resolve_externs(obj, obj->kconfig); err = err ? : bpf_object__sanitize_and_load_btf(obj); err = err ? : bpf_object__sanitize_maps(obj); @@ -10870,23 +10876,33 @@ int bpf_program__set_attach_target(struct bpf_program *prog, int attach_prog_fd, const char *attach_func_name) { - int btf_id; + int btf_obj_fd = 0, btf_id = 0, err; if (!prog || attach_prog_fd < 0 || !attach_func_name) return -EINVAL; - if (attach_prog_fd) + if (prog->obj->loaded) + return -EINVAL; + + if (attach_prog_fd) { btf_id = libbpf_find_prog_btf_id(attach_func_name, attach_prog_fd); - else - btf_id = libbpf_find_vmlinux_btf_id(attach_func_name, - prog->expected_attach_type); - - if (btf_id < 0) - return btf_id; + if (btf_id < 0) + return btf_id; + } else { + /* load btf_vmlinux, if not yet */ + err = bpf_object__load_vmlinux_btf(prog->obj, true); + if (err) + return err; + err = find_kernel_btf_id(prog->obj, attach_func_name, + prog->expected_attach_type, + &btf_obj_fd, &btf_id); + if (err) + return err; + } prog->attach_btf_id = btf_id; - prog->attach_btf_obj_fd = 0; + prog->attach_btf_obj_fd = btf_obj_fd; prog->attach_prog_fd = attach_prog_fd; return 0; } diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 6909ee81113a..3c35eb401931 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -536,6 +536,7 @@ LIBBPF_API int ring_buffer__add(struct ring_buffer *rb, int map_fd, ring_buffer_sample_fn sample_cb, void *ctx); LIBBPF_API int ring_buffer__poll(struct ring_buffer *rb, int timeout_ms); LIBBPF_API int ring_buffer__consume(struct ring_buffer *rb); +LIBBPF_API int ring_buffer__epoll_fd(const struct ring_buffer *rb); /* Perf buffer APIs */ struct perf_buffer; diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 7c4126542e2b..1c0fd2dd233a 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -346,6 +346,7 @@ LIBBPF_0.3.0 { btf__parse_split; btf__new_empty_split; btf__new_split; + ring_buffer__epoll_fd; xsk_setup_xdp_prog; xsk_socket__update_xskmap; } LIBBPF_0.2.0; diff --git a/tools/lib/bpf/ringbuf.c b/tools/lib/bpf/ringbuf.c index 98537ff2679e..8caaafe7e312 100644 --- a/tools/lib/bpf/ringbuf.c +++ b/tools/lib/bpf/ringbuf.c @@ -282,3 +282,9 @@ int ring_buffer__poll(struct ring_buffer *rb, int timeout_ms) } return cnt < 0 ? 
-errno : res; } + +/* Get an fd that can be used to sleep until data is available in the ring(s) */ +int ring_buffer__epoll_fd(const struct ring_buffer *rb) +{ + return rb->epoll_fd; +} diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index 752d8edddc66..f5b7ef93618c 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -36,3 +36,4 @@ test_cpp /runqslower /bench *.ko +xdpxceiver diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index ac25ba5d0d6c..8c33e999319a 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -19,7 +19,6 @@ ifneq ($(wildcard $(GENHDR)),) endif CLANG ?= clang -LLC ?= llc LLVM_OBJCOPY ?= llvm-objcopy BPF_GCC ?= $(shell command -v bpf-gcc;) SAN_CFLAGS ?= @@ -46,7 +45,8 @@ endif TEST_GEN_FILES = TEST_FILES = test_lwt_ip_encap.o \ - test_tc_edt.o + test_tc_edt.o \ + xsk_prereqs.sh # Order correspond to 'make run_tests' order TEST_PROGS := test_kmod.sh \ @@ -70,17 +70,17 @@ TEST_PROGS := test_kmod.sh \ test_bpftool_build.sh \ test_bpftool.sh \ test_bpftool_metadata.sh \ + test_xsk.sh TEST_PROGS_EXTENDED := with_addr.sh \ with_tunnels.sh \ - tcp_client.py \ - tcp_server.py \ test_xdp_vlan.sh # Compile but not part of 'make run_tests' TEST_GEN_PROGS_EXTENDED = test_sock_addr test_skb_cgroup_id_user \ flow_dissector_load test_flow_dissector test_tcp_check_syncookie_user \ - test_lirc_mode2_user xdping test_cpp runqslower bench bpf_testmod.ko + test_lirc_mode2_user xdping test_cpp runqslower bench bpf_testmod.ko \ + xdpxceiver TEST_CUSTOM_PROGS = urandom_read @@ -115,6 +115,13 @@ INCLUDE_DIR := $(SCRATCH_DIR)/include BPFOBJ := $(BUILD_DIR)/libbpf/libbpf.a RESOLVE_BTFIDS := $(BUILD_DIR)/resolve_btfids/resolve_btfids +VMLINUX_BTF_PATHS ?= $(if $(O),$(O)/vmlinux) \ + $(if $(KBUILD_OUTPUT),$(KBUILD_OUTPUT)/vmlinux) \ + ../../../../vmlinux \ + /sys/kernel/btf/vmlinux \ + /boot/vmlinux-$(shell uname -r) +VMLINUX_BTF ?= $(abspath $(firstword $(wildcard $(VMLINUX_BTF_PATHS)))) + # Define simple and short `make test_progs`, `make test_sysctl`, etc targets # to build individual tests. 
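# For example, 'make test_progs' from this directory rebuilds just that
# one test runner instead of the whole selftest suite.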
# NOTE: Semicolon at the end is critical to override lib.mk's default static @@ -139,6 +146,7 @@ $(OUTPUT)/urandom_read: urandom_read.c $(OUTPUT)/bpf_testmod.ko: $(VMLINUX_BTF) $(wildcard bpf_testmod/Makefile bpf_testmod/*.[ch]) $(call msg,MOD,,$@) + $(Q)$(RM) bpf_testmod/bpf_testmod.ko # force re-compilation $(Q)$(MAKE) $(submake_extras) -C bpf_testmod $(Q)cp bpf_testmod/bpf_testmod.ko $@ @@ -146,13 +154,6 @@ $(OUTPUT)/test_stub.o: test_stub.c $(BPFOBJ) $(call msg,CC,,$@) $(Q)$(CC) -c $(CFLAGS) -o $@ $< -VMLINUX_BTF_PATHS ?= $(if $(O),$(O)/vmlinux) \ - $(if $(KBUILD_OUTPUT),$(KBUILD_OUTPUT)/vmlinux) \ - ../../../../vmlinux \ - /sys/kernel/btf/vmlinux \ - /boot/vmlinux-$(shell uname -r) -VMLINUX_BTF ?= $(abspath $(firstword $(wildcard $(VMLINUX_BTF_PATHS)))) - DEFAULT_BPFTOOL := $(SCRATCH_DIR)/sbin/bpftool $(OUTPUT)/runqslower: $(BPFOBJ) | $(DEFAULT_BPFTOOL) @@ -251,31 +252,19 @@ $(OUTPUT)/flow_dissector_load.o: flow_dissector_load.h # $1 - input .c file # $2 - output .o file # $3 - CFLAGS -# $4 - LDFLAGS define CLANG_BPF_BUILD_RULE - $(call msg,CLNG-LLC,$(TRUNNER_BINARY),$2) - $(Q)($(CLANG) $3 -O2 -target bpf -emit-llvm \ - -c $1 -o - || echo "BPF obj compilation failed") | \ - $(LLC) -mattr=dwarfris -march=bpf -mcpu=v3 $4 -filetype=obj -o $2 + $(call msg,CLNG-BPF,$(TRUNNER_BINARY),$2) + $(Q)$(CLANG) $3 -O2 -target bpf -c $1 -o $2 -mcpu=v3 endef # Similar to CLANG_BPF_BUILD_RULE, but with disabled alu32 define CLANG_NOALU32_BPF_BUILD_RULE - $(call msg,CLNG-LLC,$(TRUNNER_BINARY),$2) - $(Q)($(CLANG) $3 -O2 -target bpf -emit-llvm \ - -c $1 -o - || echo "BPF obj compilation failed") | \ - $(LLC) -march=bpf -mcpu=v2 $4 -filetype=obj -o $2 -endef -# Similar to CLANG_BPF_BUILD_RULE, but using native Clang and bpf LLC -define CLANG_NATIVE_BPF_BUILD_RULE $(call msg,CLNG-BPF,$(TRUNNER_BINARY),$2) - $(Q)($(CLANG) $3 -O2 -emit-llvm \ - -c $1 -o - || echo "BPF obj compilation failed") | \ - $(LLC) -march=bpf -mcpu=v3 $4 -filetype=obj -o $2 + $(Q)$(CLANG) $3 -O2 -target bpf -c $1 -o $2 -mcpu=v2 endef # Build BPF object using GCC define GCC_BPF_BUILD_RULE $(call msg,GCC-BPF,$(TRUNNER_BINARY),$2) - $(Q)$(BPF_GCC) $3 $4 -O2 -c $1 -o $2 + $(Q)$(BPF_GCC) $3 -O2 -c $1 -o $2 endef SKEL_BLACKLIST := btf__% test_pinning_invalid.c test_sk_assign.c @@ -330,8 +319,7 @@ $(TRUNNER_BPF_OBJS): $(TRUNNER_OUTPUT)/%.o: \ $$(INCLUDE_DIR)/vmlinux.h \ $(wildcard $(BPFDIR)/bpf_*.h) | $(TRUNNER_OUTPUT) $$(call $(TRUNNER_BPF_BUILD_RULE),$$<,$$@, \ - $(TRUNNER_BPF_CFLAGS), \ - $(TRUNNER_BPF_LDFLAGS)) + $(TRUNNER_BPF_CFLAGS)) $(TRUNNER_BPF_SKELS): $(TRUNNER_OUTPUT)/%.skel.h: \ $(TRUNNER_OUTPUT)/%.o \ @@ -399,19 +387,16 @@ TRUNNER_EXTRA_FILES := $(OUTPUT)/urandom_read $(OUTPUT)/bpf_testmod.ko \ $(wildcard progs/btf_dump_test_case_*.c) TRUNNER_BPF_BUILD_RULE := CLANG_BPF_BUILD_RULE TRUNNER_BPF_CFLAGS := $(BPF_CFLAGS) $(CLANG_CFLAGS) -TRUNNER_BPF_LDFLAGS := -mattr=+alu32 $(eval $(call DEFINE_TEST_RUNNER,test_progs)) # Define test_progs-no_alu32 test runner. TRUNNER_BPF_BUILD_RULE := CLANG_NOALU32_BPF_BUILD_RULE -TRUNNER_BPF_LDFLAGS := $(eval $(call DEFINE_TEST_RUNNER,test_progs,no_alu32)) # Define test_progs BPF-GCC-flavored test runner. 
ifneq ($(BPF_GCC),) TRUNNER_BPF_BUILD_RULE := GCC_BPF_BUILD_RULE TRUNNER_BPF_CFLAGS := $(BPF_CFLAGS) $(call get_sys_includes,gcc) -TRUNNER_BPF_LDFLAGS := $(eval $(call DEFINE_TEST_RUNNER,test_progs,bpf_gcc)) endif @@ -422,7 +407,6 @@ TRUNNER_EXTRA_SOURCES := test_maps.c TRUNNER_EXTRA_FILES := TRUNNER_BPF_BUILD_RULE := $$(error no BPF objects should be built) TRUNNER_BPF_CFLAGS := -TRUNNER_BPF_LDFLAGS := $(eval $(call DEFINE_TEST_RUNNER,test_maps)) # Define test_verifier test runner. diff --git a/tools/testing/selftests/bpf/ima_setup.sh b/tools/testing/selftests/bpf/ima_setup.sh index 2bfc646bc230..8e62581113a3 100755 --- a/tools/testing/selftests/bpf/ima_setup.sh +++ b/tools/testing/selftests/bpf/ima_setup.sh @@ -7,6 +7,8 @@ set -o pipefail IMA_POLICY_FILE="/sys/kernel/security/ima/policy" TEST_BINARY="/bin/true" +VERBOSE="${SELFTESTS_VERBOSE:=0}" +LOG_FILE="$(mktemp /tmp/ima_setup.XXXX.log)" usage() { @@ -75,6 +77,19 @@ run() exec "${copied_bin_path}" } +catch() +{ + local exit_code="$1" + local log_file="$2" + + if [[ "${exit_code}" -ne 0 ]]; then + cat "${log_file}" >&3 + fi + + rm -f "${log_file}" + exit ${exit_code} +} + main() { [[ $# -ne 2 ]] && usage @@ -96,4 +111,13 @@ main() fi } +trap 'catch "$?" "${LOG_FILE}"' EXIT + +if [[ "${VERBOSE}" -eq 0 ]]; then + # Save stderr to fd 3 so that we can output back to + # it in case of an error. + exec 3>&2 1>"${LOG_FILE}" 2>&1 +fi + main "$@" +rm -f "${LOG_FILE}" diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c index 448885b95eed..0e586368948d 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c @@ -20,6 +20,7 @@ #include "bpf_iter_bpf_percpu_hash_map.skel.h" #include "bpf_iter_bpf_array_map.skel.h" #include "bpf_iter_bpf_percpu_array_map.skel.h" +#include "bpf_iter_bpf_sk_storage_helpers.skel.h" #include "bpf_iter_bpf_sk_storage_map.skel.h" #include "bpf_iter_test_kern5.skel.h" #include "bpf_iter_test_kern6.skel.h" @@ -913,6 +914,119 @@ out: bpf_iter_bpf_percpu_array_map__destroy(skel); } +/* An iterator program deletes all local storage in a map.
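+ * The test seeds sk_stg_map with a single socket entry, attaches the
+ * iterator and reads its fd to EOF, then verifies the entry is gone.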
*/ +static void test_bpf_sk_storage_delete(void) +{ + DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts); + struct bpf_iter_bpf_sk_storage_helpers *skel; + union bpf_iter_link_info linfo; + int err, len, map_fd, iter_fd; + struct bpf_link *link; + int sock_fd = -1; + __u32 val = 42; + char buf[64]; + + skel = bpf_iter_bpf_sk_storage_helpers__open_and_load(); + if (CHECK(!skel, "bpf_iter_bpf_sk_storage_helpers__open_and_load", + "skeleton open_and_load failed\n")) + return; + + map_fd = bpf_map__fd(skel->maps.sk_stg_map); + + sock_fd = socket(AF_INET6, SOCK_STREAM, 0); + if (CHECK(sock_fd < 0, "socket", "errno: %d\n", errno)) + goto out; + err = bpf_map_update_elem(map_fd, &sock_fd, &val, BPF_NOEXIST); + if (CHECK(err, "map_update", "map_update failed\n")) + goto out; + + memset(&linfo, 0, sizeof(linfo)); + linfo.map.map_fd = map_fd; + opts.link_info = &linfo; + opts.link_info_len = sizeof(linfo); + link = bpf_program__attach_iter(skel->progs.delete_bpf_sk_storage_map, + &opts); + if (CHECK(IS_ERR(link), "attach_iter", "attach_iter failed\n")) + goto out; + + iter_fd = bpf_iter_create(bpf_link__fd(link)); + if (CHECK(iter_fd < 0, "create_iter", "create_iter failed\n")) + goto free_link; + + /* do some tests */ + while ((len = read(iter_fd, buf, sizeof(buf))) > 0) + ; + if (CHECK(len < 0, "read", "read failed: %s\n", strerror(errno))) + goto close_iter; + + /* test results */ + err = bpf_map_lookup_elem(map_fd, &sock_fd, &val); + if (CHECK(!err || errno != ENOENT, "bpf_map_lookup_elem", + "map value wasn't deleted (err=%d, errno=%d)\n", err, errno)) + goto close_iter; + +close_iter: + close(iter_fd); +free_link: + bpf_link__destroy(link); +out: + if (sock_fd >= 0) + close(sock_fd); + bpf_iter_bpf_sk_storage_helpers__destroy(skel); +} + +/* This creates a socket and its local storage. It then runs a task_iter BPF + * program that replaces the existing socket local storage with the tgid of the + * only task owning a file descriptor to this socket, this process, prog_tests. + * It then runs a tcp socket iterator that negates the value in the existing + * socket local storage, the test verifies that the resulting value is -pid. 
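+ * Together the two passes exercise bpf_sk_storage_get() from both the
+ * task_file and the tcp iterator contexts.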
+ */ +static void test_bpf_sk_storage_get(void) +{ + struct bpf_iter_bpf_sk_storage_helpers *skel; + int err, map_fd, val = -1; + int sock_fd = -1; + + skel = bpf_iter_bpf_sk_storage_helpers__open_and_load(); + if (CHECK(!skel, "bpf_iter_bpf_sk_storage_helpers__open_and_load", + "skeleton open_and_load failed\n")) + return; + + sock_fd = socket(AF_INET6, SOCK_STREAM, 0); + if (CHECK(sock_fd < 0, "socket", "errno: %d\n", errno)) + goto out; + + err = listen(sock_fd, 1); + if (CHECK(err != 0, "listen", "errno: %d\n", errno)) + goto close_socket; + + map_fd = bpf_map__fd(skel->maps.sk_stg_map); + + err = bpf_map_update_elem(map_fd, &sock_fd, &val, BPF_NOEXIST); + if (CHECK(err, "bpf_map_update_elem", "map_update_failed\n")) + goto close_socket; + + do_dummy_read(skel->progs.fill_socket_owner); + + err = bpf_map_lookup_elem(map_fd, &sock_fd, &val); + if (CHECK(err || val != getpid(), "bpf_map_lookup_elem", + "map value wasn't set correctly (expected %d, got %d, err=%d)\n", + getpid(), val, err)) + goto close_socket; + + do_dummy_read(skel->progs.negate_socket_local_storage); + + err = bpf_map_lookup_elem(map_fd, &sock_fd, &val); + CHECK(err || val != -getpid(), "bpf_map_lookup_elem", + "map value wasn't set correctly (expected %d, got %d, err=%d)\n", + -getpid(), val, err); + +close_socket: + close(sock_fd); +out: + bpf_iter_bpf_sk_storage_helpers__destroy(skel); +} + static void test_bpf_sk_storage_map(void) { DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts); @@ -1067,6 +1181,10 @@ void test_bpf_iter(void) test_bpf_percpu_array_map(); if (test__start_subtest("bpf_sk_storage_map")) test_bpf_sk_storage_map(); + if (test__start_subtest("bpf_sk_storage_delete")) + test_bpf_sk_storage_delete(); + if (test__start_subtest("bpf_sk_storage_get")) + test_bpf_sk_storage_get(); if (test__start_subtest("rdonly-buf-out-of-bound")) test_rdonly_buf_out_of_bound(); if (test__start_subtest("buf-neg-offset")) diff --git a/tools/testing/selftests/bpf/prog_tests/module_attach.c b/tools/testing/selftests/bpf/prog_tests/module_attach.c index 4b65e9918764..50796b651f72 100644 --- a/tools/testing/selftests/bpf/prog_tests/module_attach.c +++ b/tools/testing/selftests/bpf/prog_tests/module_attach.c @@ -28,10 +28,18 @@ void test_module_attach(void) struct test_module_attach__bss *bss; int err; - skel = test_module_attach__open_and_load(); + skel = test_module_attach__open(); if (CHECK(!skel, "skel_open", "failed to open skeleton\n")) return; + err = bpf_program__set_attach_target(skel->progs.handle_fentry_manual, + 0, "bpf_testmod_test_read"); + ASSERT_OK(err, "set_attach_target"); + + err = test_module_attach__load(skel); + if (CHECK(err, "skel_load", "failed to load skeleton\n")) + return; + bss = skel->bss; err = test_module_attach__attach(skel); @@ -44,6 +52,7 @@ void test_module_attach(void) ASSERT_EQ(bss->raw_tp_read_sz, READ_SZ, "raw_tp"); ASSERT_EQ(bss->tp_btf_read_sz, READ_SZ, "tp_btf"); ASSERT_EQ(bss->fentry_read_sz, READ_SZ, "fentry"); + ASSERT_EQ(bss->fentry_manual_read_sz, READ_SZ, "fentry_manual"); ASSERT_EQ(bss->fexit_read_sz, READ_SZ, "fexit"); ASSERT_EQ(bss->fexit_ret, -EIO, "fexit_tet"); ASSERT_EQ(bss->fmod_ret_read_sz, READ_SZ, "fmod_ret"); diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_sk_storage_helpers.c b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_sk_storage_helpers.c new file mode 100644 index 000000000000..6cecab2b32ba --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_sk_storage_helpers.c @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 
2020 Google LLC. */ +#include "bpf_iter.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +char _license[] SEC("license") = "GPL"; + +struct { + __uint(type, BPF_MAP_TYPE_SK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, int); +} sk_stg_map SEC(".maps"); + +SEC("iter/bpf_sk_storage_map") +int delete_bpf_sk_storage_map(struct bpf_iter__bpf_sk_storage_map *ctx) +{ + if (ctx->sk) + bpf_sk_storage_delete(&sk_stg_map, ctx->sk); + + return 0; +} + +SEC("iter/task_file") +int fill_socket_owner(struct bpf_iter__task_file *ctx) +{ + struct task_struct *task = ctx->task; + struct file *file = ctx->file; + struct socket *sock; + int *sock_tgid; + + if (!task || !file) + return 0; + + sock = bpf_sock_from_file(file); + if (!sock) + return 0; + + sock_tgid = bpf_sk_storage_get(&sk_stg_map, sock->sk, 0, 0); + if (!sock_tgid) + return 0; + + *sock_tgid = task->tgid; + + return 0; +} + +SEC("iter/tcp") +int negate_socket_local_storage(struct bpf_iter__tcp *ctx) +{ + struct sock_common *sk_common = ctx->sk_common; + int *sock_tgid; + + if (!sk_common) + return 0; + + sock_tgid = bpf_sk_storage_get(&sk_stg_map, sk_common, 0, 0); + if (!sock_tgid) + return 0; + + *sock_tgid = -*sock_tgid; + + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_task.c b/tools/testing/selftests/bpf/progs/bpf_iter_task.c index 4983087852a0..b7f32c160f4e 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_task.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_task.c @@ -11,9 +11,10 @@ int dump_task(struct bpf_iter__task *ctx) { struct seq_file *seq = ctx->meta->seq; struct task_struct *task = ctx->task; + static char info[] = " === END ==="; if (task == (void *)0) { - BPF_SEQ_PRINTF(seq, " === END ===\n"); + BPF_SEQ_PRINTF(seq, "%s\n", info); return 0; } diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_module.c b/tools/testing/selftests/bpf/progs/test_core_reloc_module.c index 56363959f7b0..f59f175c7baf 100644 --- a/tools/testing/selftests/bpf/progs/test_core_reloc_module.c +++ b/tools/testing/selftests/bpf/progs/test_core_reloc_module.c @@ -40,6 +40,7 @@ int BPF_PROG(test_core_module_probed, struct task_struct *task, struct bpf_testmod_test_read_ctx *read_ctx) { +#if __has_builtin(__builtin_preserve_enum_value) struct core_reloc_module_output *out = (void *)&data.out; __u64 pid_tgid = bpf_get_current_pid_tgid(); __u32 real_tgid = (__u32)(pid_tgid >> 32); @@ -61,6 +62,9 @@ int BPF_PROG(test_core_module_probed, out->len_exists = bpf_core_field_exists(read_ctx->len); out->comm_len = BPF_CORE_READ_STR_INTO(&out->comm, task, comm); +#else + data.skip = true; +#endif return 0; } @@ -70,6 +74,7 @@ int BPF_PROG(test_core_module_direct, struct task_struct *task, struct bpf_testmod_test_read_ctx *read_ctx) { +#if __has_builtin(__builtin_preserve_enum_value) struct core_reloc_module_output *out = (void *)&data.out; __u64 pid_tgid = bpf_get_current_pid_tgid(); __u32 real_tgid = (__u32)(pid_tgid >> 32); @@ -91,6 +96,9 @@ int BPF_PROG(test_core_module_direct, out->len_exists = bpf_core_field_exists(read_ctx->len); out->comm_len = BPF_CORE_READ_STR_INTO(&out->comm, task, comm); +#else + data.skip = true; +#endif return 0; } diff --git a/tools/testing/selftests/bpf/progs/test_module_attach.c b/tools/testing/selftests/bpf/progs/test_module_attach.c index b563563df172..efd1e287ac17 100644 --- a/tools/testing/selftests/bpf/progs/test_module_attach.c +++ b/tools/testing/selftests/bpf/progs/test_module_attach.c @@ -38,6 +38,17 @@ int BPF_PROG(handle_fentry, 
return 0; } +__u32 fentry_manual_read_sz = 0; + +SEC("fentry/placeholder") +int BPF_PROG(handle_fentry_manual, + struct file *file, struct kobject *kobj, + struct bin_attribute *bin_attr, char *buf, loff_t off, size_t len) +{ + fentry_manual_read_sz = len; + return 0; +} + __u32 fexit_read_sz = 0; int fexit_ret = 0; diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index 5ef081bdae4e..7d077d48cadd 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -587,6 +587,16 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state) return -EINVAL; } } + + if (env->verbosity > VERBOSE_NONE) { + if (setenv("SELFTESTS_VERBOSE", "1", 1) == -1) { + fprintf(stderr, + "Unable to setenv SELFTESTS_VERBOSE=1 (errno=%d)", + errno); + return -1; + } + } + break; case ARG_GET_TEST_CNT: env->get_test_cnt = true; diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 4bfe3aa2cfc4..777a81404fdb 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -875,19 +875,36 @@ static int do_prog_test_run(int fd_prog, bool unpriv, uint32_t expected_val, __u8 tmp[TEST_DATA_LEN << 2]; __u32 size_tmp = sizeof(tmp); uint32_t retval; - int err; + int err, saved_errno; if (unpriv) set_admin(true); err = bpf_prog_test_run(fd_prog, 1, data, size_data, tmp, &size_tmp, &retval, NULL); + saved_errno = errno; + if (unpriv) set_admin(false); - if (err && errno != 524/*ENOTSUPP*/ && errno != EPERM) { - printf("Unexpected bpf_prog_test_run error "); - return err; + + if (err) { + switch (saved_errno) { + case 524/*ENOTSUPP*/: + printf("Did not run the program (not supported) "); + return 0; + case EPERM: + if (unpriv) { + printf("Did not run the program (no permission) "); + return 0; + } + /* fallthrough; */ + default: + printf("FAIL: Unexpected bpf_prog_test_run error (%s) ", + strerror(saved_errno)); + return err; + } } - if (!err && retval != expected_val && + + if (retval != expected_val && expected_val != POINTER_VALUE) { printf("FAIL retval %d != %d ", retval, expected_val); return 1; @@ -936,6 +953,7 @@ static void do_test_single(struct bpf_test *test, bool unpriv, int run_errs, run_successes; int map_fds[MAX_NR_MAPS]; const char *expected_err; + int saved_errno; int fixup_skips; __u32 pflags; int i, err; @@ -997,6 +1015,7 @@ static void do_test_single(struct bpf_test *test, bool unpriv, } fd_prog = bpf_load_program_xattr(&attr, bpf_vlog, sizeof(bpf_vlog)); + saved_errno = errno; /* BPF_PROG_TYPE_TRACING requires more setup and * bpf_probe_prog_type won't give correct answer @@ -1013,7 +1032,7 @@ static void do_test_single(struct bpf_test *test, bool unpriv, if (expected_ret == ACCEPT || expected_ret == VERBOSE_ACCEPT) { if (fd_prog < 0) { printf("FAIL\nFailed to load prog '%s'!\n", - strerror(errno)); + strerror(saved_errno)); goto fail_log; } #ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS diff --git a/tools/testing/selftests/bpf/test_xsk.sh b/tools/testing/selftests/bpf/test_xsk.sh new file mode 100755 index 000000000000..88a7483eaae4 --- /dev/null +++ b/tools/testing/selftests/bpf/test_xsk.sh @@ -0,0 +1,259 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# Copyright(c) 2020 Intel Corporation, Weqaar Janjua <weqaar.a.janjua@intel.com> + +# AF_XDP selftests based on veth +# +# End-to-end AF_XDP over Veth test +# +# Topology: +# --------- +# ----------- +# _ | Process | _ +# / ----------- \ +# / | \ +# / | \ 
+# ----------- | ----------- +# | Thread1 | | | Thread2 | +# ----------- | ----------- +# | | | +# ----------- | ----------- +# | xskX | | | xskY | +# ----------- | ----------- +# | | | +# ----------- | ---------- +# | vethX | --------- | vethY | +# ----------- peer ---------- +# | | | +# namespaceX | namespaceY +# +# AF_XDP is an address family optimized for high performance packet processing, +# it is XDP’s user-space interface. +# +# An AF_XDP socket is linked to a single UMEM which is a region of virtual +# contiguous memory, divided into equal-sized frames. +# +# Refer to AF_XDP Kernel Documentation for detailed information: +# https://www.kernel.org/doc/html/latest/networking/af_xdp.html +# +# Prerequisites setup by script: +# +# Set up veth interfaces as per the topology shown ^^: +# * setup two veth interfaces and one namespace +# ** veth<xxxx> in root namespace +# ** veth<yyyy> in af_xdp<xxxx> namespace +# ** namespace af_xdp<xxxx> +# * create a spec file veth.spec that includes this run-time configuration +# *** xxxx and yyyy are randomly generated 4 digit numbers used to avoid +# conflict with any existing interface +# * tests the veth and xsk layers of the topology +# +# See the source xdpxceiver.c for information on each test +# +# Kernel configuration: +# --------------------- +# See "config" file for recommended kernel config options. +# +# Turn on XDP sockets and veth support when compiling i.e. +# Networking support --> +# Networking options --> +# [ * ] XDP sockets +# +# Executing Tests: +# ---------------- +# Must run with CAP_NET_ADMIN capability. +# +# Run (full color-coded output): +# sudo ./test_xsk.sh -c +# +# If running from kselftests: +# sudo make colorconsole=1 run_tests +# +# Run (full output without color-coding): +# sudo ./test_xsk.sh + +. xsk_prereqs.sh + +while getopts c flag +do + case "${flag}" in + c) colorconsole=1;; + esac +done + +TEST_NAME="PREREQUISITES" + +URANDOM=/dev/urandom +[ ! -e "${URANDOM}" ] && { echo "${URANDOM} not found. Skipping tests."; test_exit 1 1; } + +VETH0_POSTFIX=$(cat ${URANDOM} | tr -dc '0-9' | fold -w 256 | head -n 1 | head --bytes 4) +VETH0=ve${VETH0_POSTFIX} +VETH1_POSTFIX=$(cat ${URANDOM} | tr -dc '0-9' | fold -w 256 | head -n 1 | head --bytes 4) +VETH1=ve${VETH1_POSTFIX} +NS0=root +NS1=af_xdp${VETH1_POSTFIX} +MTU=1500 + +setup_vethPairs() { + echo "setting up ${VETH0}: namespace: ${NS0}" + ip netns add ${NS1} + ip link add ${VETH0} type veth peer name ${VETH1} + if [ -f /proc/net/if_inet6 ]; then + echo 1 > /proc/sys/net/ipv6/conf/${VETH0}/disable_ipv6 + fi + echo "setting up ${VETH1}: namespace: ${NS1}" + ip link set ${VETH1} netns ${NS1} + ip netns exec ${NS1} ip link set ${VETH1} mtu ${MTU} + ip link set ${VETH0} mtu ${MTU} + ip netns exec ${NS1} ip link set ${VETH1} up + ip link set ${VETH0} up +} + +validate_root_exec +validate_veth_support ${VETH0} +validate_ip_utility +setup_vethPairs + +retval=$? +if [ $retval -ne 0 ]; then + test_status $retval "${TEST_NAME}" + cleanup_exit ${VETH0} ${VETH1} ${NS1} + exit $retval +fi + +echo "${VETH0}:${VETH1},${NS1}" > ${SPECFILE} + +validate_veth_spec_file + +echo "Spec file created: ${SPECFILE}" + +test_status $retval "${TEST_NAME}" + +## START TESTS + +statusList=() + +### TEST 1 +TEST_NAME="XSK KSELFTEST FRAMEWORK" + +echo "Switching interfaces [${VETH0}, ${VETH1}] to XDP Generic mode" +vethXDPgeneric ${VETH0} ${VETH1} ${NS1} + +retval=$? 
+if [ $retval -eq 0 ]; then + echo "Switching interfaces [${VETH0}, ${VETH1}] to XDP Native mode" + vethXDPnative ${VETH0} ${VETH1} ${NS1} +fi + +retval=$? +test_status $retval "${TEST_NAME}" +statusList+=($retval) + +### TEST 2 +TEST_NAME="SKB NOPOLL" + +vethXDPgeneric ${VETH0} ${VETH1} ${NS1} + +params=("-S") +execxdpxceiver params + +retval=$? +test_status $retval "${TEST_NAME}" +statusList+=($retval) + +### TEST 3 +TEST_NAME="SKB POLL" + +vethXDPgeneric ${VETH0} ${VETH1} ${NS1} + +params=("-S" "-p") +execxdpxceiver params + +retval=$? +test_status $retval "${TEST_NAME}" +statusList+=($retval) + +### TEST 4 +TEST_NAME="DRV NOPOLL" + +vethXDPnative ${VETH0} ${VETH1} ${NS1} + +params=("-N") +execxdpxceiver params + +retval=$? +test_status $retval "${TEST_NAME}" +statusList+=($retval) + +### TEST 5 +TEST_NAME="DRV POLL" + +vethXDPnative ${VETH0} ${VETH1} ${NS1} + +params=("-N" "-p") +execxdpxceiver params + +retval=$? +test_status $retval "${TEST_NAME}" +statusList+=($retval) + +### TEST 6 +TEST_NAME="SKB SOCKET TEARDOWN" + +vethXDPgeneric ${VETH0} ${VETH1} ${NS1} + +params=("-S" "-T") +execxdpxceiver params + +retval=$? +test_status $retval "${TEST_NAME}" +statusList+=($retval) + +### TEST 7 +TEST_NAME="DRV SOCKET TEARDOWN" + +vethXDPnative ${VETH0} ${VETH1} ${NS1} + +params=("-N" "-T") +execxdpxceiver params + +retval=$? +test_status $retval "${TEST_NAME}" +statusList+=($retval) + +### TEST 8 +TEST_NAME="SKB BIDIRECTIONAL SOCKETS" + +vethXDPgeneric ${VETH0} ${VETH1} ${NS1} + +params=("-S" "-B") +execxdpxceiver params + +retval=$? +test_status $retval "${TEST_NAME}" +statusList+=($retval) + +### TEST 9 +TEST_NAME="DRV BIDIRECTIONAL SOCKETS" + +vethXDPnative ${VETH0} ${VETH1} ${NS1} + +params=("-N" "-B") +execxdpxceiver params + +retval=$? +test_status $retval "${TEST_NAME}" +statusList+=($retval) + +## END TESTS + +cleanup_exit ${VETH0} ${VETH1} ${NS1} + +for _status in "${statusList[@]}" +do + if [ $_status -ne 0 ]; then + test_exit $ksft_fail 0 + fi +done + +test_exit $ksft_pass 0 diff --git a/tools/testing/selftests/bpf/verifier/unpriv.c b/tools/testing/selftests/bpf/verifier/unpriv.c index 91bb77c24a2e..a3fe0fbaed41 100644 --- a/tools/testing/selftests/bpf/verifier/unpriv.c +++ b/tools/testing/selftests/bpf/verifier/unpriv.c @@ -108,8 +108,9 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_8b = { 3 }, - .errstr = "invalid indirect read from stack off -8+0 size 8", - .result = REJECT, + .errstr_unpriv = "invalid indirect read from stack off -8+0 size 8", + .result_unpriv = REJECT, + .result = ACCEPT, }, { "unpriv: mangle pointer on stack 1", diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c new file mode 100644 index 000000000000..014dedaa4dd2 --- /dev/null +++ b/tools/testing/selftests/bpf/xdpxceiver.c @@ -0,0 +1,1074 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2020 Intel Corporation. */ + +/* + * Some functions in this program are taken from + * Linux kernel samples/bpf/xdpsock* and modified + * for use. + * + * See test_xsk.sh for detailed information on test topology + * and prerequisite network setup. + * + * This test program contains two threads, each thread is single socket with + * a unique UMEM. It validates in-order packet delivery and packet content + * by sending packets to each other. + * + * Tests Information: + * ------------------ + * These selftests test AF_XDP SKB and Native/DRV modes using veth + * Virtual Ethernet interfaces. + * + * The following tests are run: + * + * 1. 
AF_XDP SKB mode + * Generic mode XDP is driver independent, used when the driver does + * not have support for XDP. Works on any netdevice using sockets and + * generic XDP path. XDP hook from netif_receive_skb(). + * a. nopoll - soft-irq processing + * b. poll - using poll() syscall + * c. Socket Teardown + * Create a Tx and a Rx socket, Tx from one socket, Rx on another. Destroy + * both sockets, then repeat multiple times. Only nopoll mode is used + * d. Bi-directional sockets + * Configure sockets as bi-directional tx/rx sockets, sets up fill and + * completion rings on each socket, tx/rx in both directions. Only nopoll + * mode is used + * + * 2. AF_XDP DRV/Native mode + * Works on any netdevice with XDP_REDIRECT support, driver dependent. Processes + * packets before SKB allocation. Provides better performance than SKB. Driver + * hook available just after DMA of buffer descriptor. + * a. nopoll + * b. poll + * c. Socket Teardown + * d. Bi-directional sockets + * - Only copy mode is supported because veth does not currently support + * zero-copy mode + * + * Total tests: 8 + * + * Flow: + * ----- + * - Single process spawns two threads: Tx and Rx + * - Each of these two threads attach to a veth interface within their assigned + * namespaces + * - Each thread Creates one AF_XDP socket connected to a unique umem for each + * veth interface + * - Tx thread Transmits 10k packets from veth<xxxx> to veth<yyyy> + * - Rx thread verifies if all 10k packets were received and delivered in-order, + * and have the right content + * + * Enable/disable debug mode: + * -------------------------- + * To enable L2 - L4 headers and payload dump of each packet on STDOUT, add + * parameter -D to params array in test_xsk.sh, i.e. params=("-S" "-D") + */ + +#define _GNU_SOURCE +#include <fcntl.h> +#include <errno.h> +#include <getopt.h> +#include <asm/barrier.h> +typedef __u16 __sum16; +#include <linux/if_link.h> +#include <linux/if_ether.h> +#include <linux/ip.h> +#include <linux/udp.h> +#include <arpa/inet.h> +#include <net/if.h> +#include <locale.h> +#include <poll.h> +#include <pthread.h> +#include <signal.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <stddef.h> +#include <sys/mman.h> +#include <sys/resource.h> +#include <sys/types.h> +#include <sys/queue.h> +#include <time.h> +#include <unistd.h> +#include <stdatomic.h> +#include <bpf/xsk.h> +#include "xdpxceiver.h" +#include "../kselftest.h" + +static void __exit_with_error(int error, const char *file, const char *func, int line) +{ + ksft_test_result_fail + ("[%s:%s:%i]: ERROR: %d/\"%s\"\n", file, func, line, error, strerror(error)); + ksft_exit_xfail(); +} + +#define exit_with_error(error) __exit_with_error(error, __FILE__, __func__, __LINE__) + +#define print_ksft_result(void)\ + (ksft_test_result_pass("PASS: %s %s %s%s\n", uut ? "DRV" : "SKB", opt_poll ? "POLL" :\ + "NOPOLL", opt_teardown ? "Socket Teardown" : "",\ + opt_bidi ? 
"Bi-directional Sockets" : "")) + +static void pthread_init_mutex(void) +{ + pthread_mutex_init(&sync_mutex, NULL); + pthread_mutex_init(&sync_mutex_tx, NULL); + pthread_cond_init(&signal_rx_condition, NULL); + pthread_cond_init(&signal_tx_condition, NULL); +} + +static void pthread_destroy_mutex(void) +{ + pthread_mutex_destroy(&sync_mutex); + pthread_mutex_destroy(&sync_mutex_tx); + pthread_cond_destroy(&signal_rx_condition); + pthread_cond_destroy(&signal_tx_condition); +} + +static void *memset32_htonl(void *dest, u32 val, u32 size) +{ + u32 *ptr = (u32 *)dest; + int i; + + val = htonl(val); + + for (i = 0; i < (size & (~0x3)); i += 4) + ptr[i >> 2] = val; + + for (; i < size; i++) + ((char *)dest)[i] = ((char *)&val)[i & 3]; + + return dest; +} + +/* + * This function code has been taken from + * Linux kernel lib/checksum.c + */ +static inline unsigned short from32to16(unsigned int x) +{ + /* add up 16-bit and 16-bit for 16+c bit */ + x = (x & 0xffff) + (x >> 16); + /* add up carry.. */ + x = (x & 0xffff) + (x >> 16); + return x; +} + +/* + * Fold a partial checksum + * This function code has been taken from + * Linux kernel include/asm-generic/checksum.h + */ +static inline __u16 csum_fold(__u32 csum) +{ + u32 sum = (__force u32)csum; + + sum = (sum & 0xffff) + (sum >> 16); + sum = (sum & 0xffff) + (sum >> 16); + return (__force __u16)~sum; +} + +/* + * This function code has been taken from + * Linux kernel lib/checksum.c + */ +static inline u32 from64to32(u64 x) +{ + /* add up 32-bit and 32-bit for 32+c bit */ + x = (x & 0xffffffff) + (x >> 32); + /* add up carry.. */ + x = (x & 0xffffffff) + (x >> 32); + return (u32)x; +} + +__u32 csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, __u32 sum); + +/* + * This function code has been taken from + * Linux kernel lib/checksum.c + */ +__u32 csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, __u32 sum) +{ + unsigned long long s = (__force u32)sum; + + s += (__force u32)saddr; + s += (__force u32)daddr; +#ifdef __BIG_ENDIAN__ + s += proto + len; +#else + s += (proto + len) << 8; +#endif + return (__force __u32)from64to32(s); +} + +/* + * This function has been taken from + * Linux kernel include/asm-generic/checksum.h + */ +static inline __u16 +csum_tcpudp_magic(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, __u32 sum) +{ + return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum)); +} + +static inline u16 udp_csum(u32 saddr, u32 daddr, u32 len, u8 proto, u16 *udp_pkt) +{ + u32 csum = 0; + u32 cnt = 0; + + /* udp hdr and data */ + for (; cnt < len; cnt += 2) + csum += udp_pkt[cnt >> 1]; + + return csum_tcpudp_magic(saddr, daddr, len, proto, csum); +} + +static void gen_eth_hdr(void *data, struct ethhdr *eth_hdr) +{ + memcpy(eth_hdr->h_dest, ((struct ifobject *)data)->dst_mac, ETH_ALEN); + memcpy(eth_hdr->h_source, ((struct ifobject *)data)->src_mac, ETH_ALEN); + eth_hdr->h_proto = htons(ETH_P_IP); +} + +static void gen_ip_hdr(void *data, struct iphdr *ip_hdr) +{ + ip_hdr->version = IP_PKT_VER; + ip_hdr->ihl = 0x5; + ip_hdr->tos = IP_PKT_TOS; + ip_hdr->tot_len = htons(IP_PKT_SIZE); + ip_hdr->id = 0; + ip_hdr->frag_off = 0; + ip_hdr->ttl = IPDEFTTL; + ip_hdr->protocol = IPPROTO_UDP; + ip_hdr->saddr = ((struct ifobject *)data)->src_ip; + ip_hdr->daddr = ((struct ifobject *)data)->dst_ip; + ip_hdr->check = 0; +} + +static void gen_udp_hdr(void *data, void *arg, struct udphdr *udp_hdr) +{ + udp_hdr->source = htons(((struct ifobject *)arg)->src_port); + udp_hdr->dest = htons(((struct ifobject 
*)arg)->dst_port); + udp_hdr->len = htons(UDP_PKT_SIZE); + memset32_htonl(pkt_data + PKT_HDR_SIZE, + htonl(((struct generic_data *)data)->seqnum), UDP_PKT_DATA_SIZE); +} + +static void gen_udp_csum(struct udphdr *udp_hdr, struct iphdr *ip_hdr) +{ + udp_hdr->check = 0; + udp_hdr->check = + udp_csum(ip_hdr->saddr, ip_hdr->daddr, UDP_PKT_SIZE, IPPROTO_UDP, (u16 *)udp_hdr); +} + +static void gen_eth_frame(struct xsk_umem_info *umem, u64 addr) +{ + memcpy(xsk_umem__get_data(umem->buffer, addr), pkt_data, PKT_SIZE); +} + +static void xsk_configure_umem(struct ifobject *data, void *buffer, u64 size) +{ + int ret; + + data->umem = calloc(1, sizeof(struct xsk_umem_info)); + if (!data->umem) + exit_with_error(errno); + + ret = xsk_umem__create(&data->umem->umem, buffer, size, + &data->umem->fq, &data->umem->cq, NULL); + if (ret) + exit_with_error(ret); + + data->umem->buffer = buffer; +} + +static void xsk_populate_fill_ring(struct xsk_umem_info *umem) +{ + int ret, i; + u32 idx; + + ret = xsk_ring_prod__reserve(&umem->fq, XSK_RING_PROD__DEFAULT_NUM_DESCS, &idx); + if (ret != XSK_RING_PROD__DEFAULT_NUM_DESCS) + exit_with_error(ret); + for (i = 0; i < XSK_RING_PROD__DEFAULT_NUM_DESCS; i++) + *xsk_ring_prod__fill_addr(&umem->fq, idx++) = i * XSK_UMEM__DEFAULT_FRAME_SIZE; + xsk_ring_prod__submit(&umem->fq, XSK_RING_PROD__DEFAULT_NUM_DESCS); +} + +static int xsk_configure_socket(struct ifobject *ifobject) +{ + struct xsk_socket_config cfg; + struct xsk_ring_cons *rxr; + struct xsk_ring_prod *txr; + int ret; + + ifobject->xsk = calloc(1, sizeof(struct xsk_socket_info)); + if (!ifobject->xsk) + exit_with_error(errno); + + ifobject->xsk->umem = ifobject->umem; + cfg.rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS; + cfg.tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS; + cfg.libbpf_flags = 0; + cfg.xdp_flags = opt_xdp_flags; + cfg.bind_flags = opt_xdp_bind_flags; + + if (!opt_bidi) { + rxr = (ifobject->fv.vector == rx) ? &ifobject->xsk->rx : NULL; + txr = (ifobject->fv.vector == tx) ? 
&ifobject->xsk->tx : NULL; + } else { + rxr = &ifobject->xsk->rx; + txr = &ifobject->xsk->tx; + } + + ret = xsk_socket__create(&ifobject->xsk->xsk, ifobject->ifname, + opt_queue, ifobject->umem->umem, rxr, txr, &cfg); + + if (ret) + return 1; + + return 0; +} + +static struct option long_options[] = { + {"interface", required_argument, 0, 'i'}, + {"queue", optional_argument, 0, 'q'}, + {"poll", no_argument, 0, 'p'}, + {"xdp-skb", no_argument, 0, 'S'}, + {"xdp-native", no_argument, 0, 'N'}, + {"copy", no_argument, 0, 'c'}, + {"tear-down", no_argument, 0, 'T'}, + {"bidi", optional_argument, 0, 'B'}, + {"debug", optional_argument, 0, 'D'}, + {"tx-pkt-count", optional_argument, 0, 'C'}, + {0, 0, 0, 0} +}; + +static void usage(const char *prog) +{ + const char *str = + " Usage: %s [OPTIONS]\n" + " Options:\n" + " -i, --interface Use interface\n" + " -q, --queue=n Use queue n (default 0)\n" + " -p, --poll Use poll syscall\n" + " -S, --xdp-skb=n Use XDP SKB mode\n" + " -N, --xdp-native=n Enforce XDP DRV (native) mode\n" + " -c, --copy Force copy mode\n" + " -T, --tear-down Tear down sockets by repeatedly recreating them\n" + " -B, --bidi Bi-directional sockets test\n" + " -D, --debug Debug mode - dump packets L2 - L5\n" + " -C, --tx-pkt-count=n Number of packets to send\n"; + ksft_print_msg(str, prog); +} + +static bool switch_namespace(int idx) +{ + char fqns[26] = "/var/run/netns/"; + int nsfd; + + strncat(fqns, ifdict[idx]->nsname, sizeof(fqns) - strlen(fqns) - 1); + nsfd = open(fqns, O_RDONLY); + + if (nsfd == -1) + exit_with_error(errno); + + if (setns(nsfd, 0) == -1) + exit_with_error(errno); + + return true; +} + +static void *nsswitchthread(void *args) +{ + if (switch_namespace(((struct targs *)args)->idx)) { + ifdict[((struct targs *)args)->idx]->ifindex = + if_nametoindex(ifdict[((struct targs *)args)->idx]->ifname); + if (!ifdict[((struct targs *)args)->idx]->ifindex) { + ksft_test_result_fail + ("ERROR: [%s] interface \"%s\" does not exist\n", + __func__, ifdict[((struct targs *)args)->idx]->ifname); + ((struct targs *)args)->retptr = false; + } else { + ksft_print_msg("Interface found: %s\n", + ifdict[((struct targs *)args)->idx]->ifname); + ((struct targs *)args)->retptr = true; + } + } else { + ((struct targs *)args)->retptr = false; + } + pthread_exit(NULL); +} + +static int validate_interfaces(void) +{ + bool ret = true; + + for (int i = 0; i < MAX_INTERFACES; i++) { + if (!strcmp(ifdict[i]->ifname, "")) { + ret = false; + ksft_test_result_fail("ERROR: interfaces: -i <int>,<ns> -i <int>,<ns>."); + } + if (strcmp(ifdict[i]->nsname, "")) { + struct targs *targs; + + targs = (struct targs *)malloc(sizeof(struct targs)); + if (!targs) + exit_with_error(errno); + + targs->idx = i; + if (pthread_create(&ns_thread, NULL, nsswitchthread, (void *)targs)) + exit_with_error(errno); + + pthread_join(ns_thread, NULL); + + if (targs->retptr) + ksft_print_msg("NS switched: %s\n", ifdict[i]->nsname); + + free(targs); + } else { + ifdict[i]->ifindex = if_nametoindex(ifdict[i]->ifname); + if (!ifdict[i]->ifindex) { + ksft_test_result_fail + ("ERROR: interface \"%s\" does not exist\n", ifdict[i]->ifname); + ret = false; + } else { + ksft_print_msg("Interface found: %s\n", ifdict[i]->ifname); + } + } + } + return ret; +} + +static void parse_command_line(int argc, char **argv) +{ + int option_index, interface_index = 0, c; + + opterr = 0; + + for (;;) { + c = getopt_long(argc, argv, "i:q:pSNcTBDC:", long_options, &option_index); + + if (c == -1) + break; + + switch (c) { + case 'i': + if 
(interface_index == MAX_INTERFACES) + break; + char *sptr, *token; + + sptr = strndupa(optarg, strlen(optarg)); + memcpy(ifdict[interface_index]->ifname, + strsep(&sptr, ","), MAX_INTERFACE_NAME_CHARS); + token = strsep(&sptr, ","); + if (token) + memcpy(ifdict[interface_index]->nsname, token, + MAX_INTERFACES_NAMESPACE_CHARS); + interface_index++; + break; + case 'q': + opt_queue = atoi(optarg); + break; + case 'p': + opt_poll = 1; + break; + case 'S': + opt_xdp_flags |= XDP_FLAGS_SKB_MODE; + opt_xdp_bind_flags |= XDP_COPY; + uut = ORDER_CONTENT_VALIDATE_XDP_SKB; + break; + case 'N': + opt_xdp_flags |= XDP_FLAGS_DRV_MODE; + opt_xdp_bind_flags |= XDP_COPY; + uut = ORDER_CONTENT_VALIDATE_XDP_DRV; + break; + case 'c': + opt_xdp_bind_flags |= XDP_COPY; + break; + case 'T': + opt_teardown = 1; + break; + case 'B': + opt_bidi = 1; + break; + case 'D': + debug_pkt_dump = 1; + break; + case 'C': + opt_pkt_count = atoi(optarg); + break; + default: + usage(basename(argv[0])); + ksft_exit_xfail(); + } + } + + if (!validate_interfaces()) { + usage(basename(argv[0])); + ksft_exit_xfail(); + } +} + +static void kick_tx(struct xsk_socket_info *xsk) +{ + int ret; + + ret = sendto(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, 0); + if (ret >= 0 || errno == ENOBUFS || errno == EAGAIN || errno == EBUSY || errno == ENETDOWN) + return; + exit_with_error(errno); +} + +static inline void complete_tx_only(struct xsk_socket_info *xsk, int batch_size) +{ + unsigned int rcvd; + u32 idx; + + if (!xsk->outstanding_tx) + return; + + if (!NEED_WAKEUP || xsk_ring_prod__needs_wakeup(&xsk->tx)) + kick_tx(xsk); + + rcvd = xsk_ring_cons__peek(&xsk->umem->cq, batch_size, &idx); + if (rcvd) { + xsk_ring_cons__release(&xsk->umem->cq, rcvd); + xsk->outstanding_tx -= rcvd; + xsk->tx_npkts += rcvd; + } +} + +static void rx_pkt(struct xsk_socket_info *xsk, struct pollfd *fds) +{ + unsigned int rcvd, i; + u32 idx_rx = 0, idx_fq = 0; + int ret; + + rcvd = xsk_ring_cons__peek(&xsk->rx, BATCH_SIZE, &idx_rx); + if (!rcvd) { + if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) { + ret = poll(fds, 1, POLL_TMOUT); + if (ret < 0) + exit_with_error(ret); + } + return; + } + + ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq); + while (ret != rcvd) { + if (ret < 0) + exit_with_error(ret); + if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) { + ret = poll(fds, 1, POLL_TMOUT); + if (ret < 0) + exit_with_error(ret); + } + ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq); + } + + for (i = 0; i < rcvd; i++) { + u64 addr = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx)->addr; + (void)xsk_ring_cons__rx_desc(&xsk->rx, idx_rx++)->len; + u64 orig = xsk_umem__extract_addr(addr); + + addr = xsk_umem__add_offset_to_addr(addr); + pkt_node_rx = malloc(sizeof(struct pkt) + PKT_SIZE); + if (!pkt_node_rx) + exit_with_error(errno); + + pkt_node_rx->pkt_frame = (char *)malloc(PKT_SIZE); + if (!pkt_node_rx->pkt_frame) + exit_with_error(errno); + + memcpy(pkt_node_rx->pkt_frame, xsk_umem__get_data(xsk->umem->buffer, addr), + PKT_SIZE); + + TAILQ_INSERT_HEAD(&head, pkt_node_rx, pkt_nodes); + + *xsk_ring_prod__fill_addr(&xsk->umem->fq, idx_fq++) = orig; + } + + xsk_ring_prod__submit(&xsk->umem->fq, rcvd); + xsk_ring_cons__release(&xsk->rx, rcvd); + xsk->rx_npkts += rcvd; +} + +static void tx_only(struct xsk_socket_info *xsk, u32 *frameptr, int batch_size) +{ + u32 idx; + unsigned int i; + + while (xsk_ring_prod__reserve(&xsk->tx, batch_size, &idx) < batch_size) + complete_tx_only(xsk, batch_size); + + for (i = 0; i < batch_size; i++) { + struct 
xdp_desc *tx_desc = xsk_ring_prod__tx_desc(&xsk->tx, idx + i); + + tx_desc->addr = (*frameptr + i) << XSK_UMEM__DEFAULT_FRAME_SHIFT; + tx_desc->len = PKT_SIZE; + } + + xsk_ring_prod__submit(&xsk->tx, batch_size); + xsk->outstanding_tx += batch_size; + *frameptr += batch_size; + *frameptr %= num_frames; + complete_tx_only(xsk, batch_size); +} + +static inline int get_batch_size(int pkt_cnt) +{ + if (!opt_pkt_count) + return BATCH_SIZE; + + if (pkt_cnt + BATCH_SIZE <= opt_pkt_count) + return BATCH_SIZE; + + return opt_pkt_count - pkt_cnt; +} + +static void complete_tx_only_all(void *arg) +{ + bool pending; + + do { + pending = false; + if (((struct ifobject *)arg)->xsk->outstanding_tx) { + complete_tx_only(((struct ifobject *) + arg)->xsk, BATCH_SIZE); + pending = !!((struct ifobject *)arg)->xsk->outstanding_tx; + } + } while (pending); +} + +static void tx_only_all(void *arg) +{ + struct pollfd fds[MAX_SOCKS] = { }; + u32 frame_nb = 0; + int pkt_cnt = 0; + int ret; + + fds[0].fd = xsk_socket__fd(((struct ifobject *)arg)->xsk->xsk); + fds[0].events = POLLOUT; + + while ((opt_pkt_count && pkt_cnt < opt_pkt_count) || !opt_pkt_count) { + int batch_size = get_batch_size(pkt_cnt); + + if (opt_poll) { + ret = poll(fds, 1, POLL_TMOUT); + if (ret <= 0) + continue; + + if (!(fds[0].revents & POLLOUT)) + continue; + } + + tx_only(((struct ifobject *)arg)->xsk, &frame_nb, batch_size); + pkt_cnt += batch_size; + } + + if (opt_pkt_count) + complete_tx_only_all(arg); +} + +static void worker_pkt_dump(void) +{ + struct in_addr ipaddr; + + fprintf(stdout, "---------------------------------------\n"); + for (int iter = 0; iter < num_frames - 1; iter++) { + /*extract L2 frame */ + fprintf(stdout, "DEBUG>> L2: dst mac: "); + for (int i = 0; i < ETH_ALEN; i++) + fprintf(stdout, "%02X", ((struct ethhdr *) + pkt_buf[iter]->payload)->h_dest[i]); + + fprintf(stdout, "\nDEBUG>> L2: src mac: "); + for (int i = 0; i < ETH_ALEN; i++) + fprintf(stdout, "%02X", ((struct ethhdr *) + pkt_buf[iter]->payload)->h_source[i]); + + /*extract L3 frame */ + fprintf(stdout, "\nDEBUG>> L3: ip_hdr->ihl: %02X\n", + ((struct iphdr *)(pkt_buf[iter]->payload + sizeof(struct ethhdr)))->ihl); + + ipaddr.s_addr = + ((struct iphdr *)(pkt_buf[iter]->payload + sizeof(struct ethhdr)))->saddr; + fprintf(stdout, "DEBUG>> L3: ip_hdr->saddr: %s\n", inet_ntoa(ipaddr)); + + ipaddr.s_addr = + ((struct iphdr *)(pkt_buf[iter]->payload + sizeof(struct ethhdr)))->daddr; + fprintf(stdout, "DEBUG>> L3: ip_hdr->daddr: %s\n", inet_ntoa(ipaddr)); + + /*extract L4 frame */ + fprintf(stdout, "DEBUG>> L4: udp_hdr->src: %d\n", + ntohs(((struct udphdr *)(pkt_buf[iter]->payload + + sizeof(struct ethhdr) + + sizeof(struct iphdr)))->source)); + + fprintf(stdout, "DEBUG>> L4: udp_hdr->dst: %d\n", + ntohs(((struct udphdr *)(pkt_buf[iter]->payload + + sizeof(struct ethhdr) + + sizeof(struct iphdr)))->dest)); + /*extract L5 frame */ + int payload = *((uint32_t *)(pkt_buf[iter]->payload + PKT_HDR_SIZE)); + + if (payload == EOT) { + ksft_print_msg("End-of-transmission frame received\n"); + fprintf(stdout, "---------------------------------------\n"); + break; + } + fprintf(stdout, "DEBUG>> L5: payload: %d\n", payload); + fprintf(stdout, "---------------------------------------\n"); + } +} + +static void worker_pkt_validate(void) +{ + u32 payloadseqnum = -2; + + while (1) { + /* TAILQ_LAST() returns the existing tail node; no allocation is needed. */ + pkt_node_rx_q = TAILQ_LAST(&head, head_s); + if (!pkt_node_rx_q) + break; + /*do not increment pktcounter if !(tos=0x9 and ipv4) */ + if ((((struct iphdr
*)(pkt_node_rx_q->pkt_frame + + sizeof(struct ethhdr)))->version == IP_PKT_VER) + && (((struct iphdr *)(pkt_node_rx_q->pkt_frame + sizeof(struct ethhdr)))->tos == + IP_PKT_TOS)) { + payloadseqnum = *((uint32_t *) (pkt_node_rx_q->pkt_frame + PKT_HDR_SIZE)); + if (debug_pkt_dump && payloadseqnum != EOT) { + pkt_obj = (struct pkt_frame *)malloc(sizeof(struct pkt_frame)); + pkt_obj->payload = (char *)malloc(PKT_SIZE); + memcpy(pkt_obj->payload, pkt_node_rx_q->pkt_frame, PKT_SIZE); + pkt_buf[payloadseqnum] = pkt_obj; + } + + if (payloadseqnum == EOT) { + ksft_print_msg("End-of-transmission frame received: PASS\n"); + sigvar = 1; + break; + } + + if (prev_pkt + 1 != payloadseqnum) { + ksft_test_result_fail + ("ERROR: [%s] prev_pkt [%d], payloadseqnum [%d]\n", + __func__, prev_pkt, payloadseqnum); + ksft_exit_xfail(); + } + + TAILQ_REMOVE(&head, pkt_node_rx_q, pkt_nodes); + free(pkt_node_rx_q->pkt_frame); + free(pkt_node_rx_q); + pkt_node_rx_q = NULL; + prev_pkt = payloadseqnum; + pkt_counter++; + } else { + ksft_print_msg("Invalid frame received: "); + ksft_print_msg("[IP_PKT_VER: %02X], [IP_PKT_TOS: %02X]\n", + ((struct iphdr *)(pkt_node_rx_q->pkt_frame + + sizeof(struct ethhdr)))->version, + ((struct iphdr *)(pkt_node_rx_q->pkt_frame + + sizeof(struct ethhdr)))->tos); + TAILQ_REMOVE(&head, pkt_node_rx_q, pkt_nodes); + free(pkt_node_rx_q->pkt_frame); + free(pkt_node_rx_q); + pkt_node_rx_q = NULL; + } + } +} + +static void thread_common_ops(void *arg, void *bufs, pthread_mutex_t *mutexptr, + atomic_int *spinningptr) +{ + int ctr = 0; + int ret; + + xsk_configure_umem((struct ifobject *)arg, bufs, num_frames * XSK_UMEM__DEFAULT_FRAME_SIZE); + ret = xsk_configure_socket((struct ifobject *)arg); + + /* Retry socket creation if it fails, as xsk_socket__create() + * is asynchronous. + * + * It is essential to take the mutex here to prevent the Tx thread from + * entering before Rx and causing a deadlock. + */ + pthread_mutex_lock(mutexptr); + while (ret && ctr < SOCK_RECONF_CTR) { + atomic_store(spinningptr, 1); + xsk_configure_umem((struct ifobject *)arg, + bufs, num_frames * XSK_UMEM__DEFAULT_FRAME_SIZE); + ret = xsk_configure_socket((struct ifobject *)arg); + usleep(USLEEP_MAX); + ctr++; + } + atomic_store(spinningptr, 0); + pthread_mutex_unlock(mutexptr); + + if (ctr >= SOCK_RECONF_CTR) + exit_with_error(ret); +} + +static void *worker_testapp_validate(void *arg) +{ + struct udphdr *udp_hdr = + (struct udphdr *)(pkt_data + sizeof(struct ethhdr) + sizeof(struct iphdr)); + struct generic_data *data = (struct generic_data *)malloc(sizeof(struct generic_data)); + struct iphdr *ip_hdr = (struct iphdr *)(pkt_data + sizeof(struct ethhdr)); + struct ethhdr *eth_hdr = (struct ethhdr *)pkt_data; + void *bufs = NULL; + + pthread_attr_setstacksize(&attr, THREAD_STACK); + + if (!bidi_pass) { + bufs = mmap(NULL, num_frames * XSK_UMEM__DEFAULT_FRAME_SIZE, + PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (bufs == MAP_FAILED) + exit_with_error(errno); + + if (strcmp(((struct ifobject *)arg)->nsname, "")) + switch_namespace(((struct ifobject *)arg)->ifdict_index); + } + + if (((struct ifobject *)arg)->fv.vector == tx) { + int spinningrxctr = 0; + + if (!bidi_pass) + thread_common_ops(arg, bufs, &sync_mutex_tx, &spinning_tx); + + while (atomic_load(&spinning_rx) && spinningrxctr < SOCK_RECONF_CTR) { + spinningrxctr++; + usleep(USLEEP_MAX); + } + + ksft_print_msg("Interface [%s] vector [Tx]\n", ((struct ifobject *)arg)->ifname); + for (int i = 0; i < num_frames; i++) { + /*send EOT frame */ + if (i == (num_frames -
1)) + data->seqnum = -1; + else + data->seqnum = i; + gen_udp_hdr((void *)data, (void *)arg, udp_hdr); + gen_ip_hdr((void *)arg, ip_hdr); + gen_udp_csum(udp_hdr, ip_hdr); + gen_eth_hdr((void *)arg, eth_hdr); + gen_eth_frame(((struct ifobject *)arg)->umem, + i * XSK_UMEM__DEFAULT_FRAME_SIZE); + } + + free(data); + ksft_print_msg("Sending %d packets on interface %s\n", + (opt_pkt_count - 1), ((struct ifobject *)arg)->ifname); + tx_only_all(arg); + } else if (((struct ifobject *)arg)->fv.vector == rx) { + struct pollfd fds[MAX_SOCKS] = { }; + int ret; + + if (!bidi_pass) + thread_common_ops(arg, bufs, &sync_mutex_tx, &spinning_rx); + + ksft_print_msg("Interface [%s] vector [Rx]\n", ((struct ifobject *)arg)->ifname); + xsk_populate_fill_ring(((struct ifobject *)arg)->umem); + + TAILQ_INIT(&head); + if (debug_pkt_dump) { + pkt_buf = malloc(sizeof(struct pkt_frame **) * num_frames); + if (!pkt_buf) + exit_with_error(errno); + } + + fds[0].fd = xsk_socket__fd(((struct ifobject *)arg)->xsk->xsk); + fds[0].events = POLLIN; + + pthread_mutex_lock(&sync_mutex); + pthread_cond_signal(&signal_rx_condition); + pthread_mutex_unlock(&sync_mutex); + + while (1) { + if (opt_poll) { + ret = poll(fds, 1, POLL_TMOUT); + if (ret <= 0) + continue; + } + rx_pkt(((struct ifobject *)arg)->xsk, fds); + worker_pkt_validate(); + + if (sigvar) + break; + } + + ksft_print_msg("Received %d packets on interface %s\n", + pkt_counter, ((struct ifobject *)arg)->ifname); + + if (opt_teardown) + ksft_print_msg("Destroying socket\n"); + } + + if (!opt_bidi || (opt_bidi && bidi_pass)) { + xsk_socket__delete(((struct ifobject *)arg)->xsk->xsk); + (void)xsk_umem__delete(((struct ifobject *)arg)->umem->umem); + } + pthread_exit(NULL); +} + +static void testapp_validate(void) +{ + pthread_attr_init(&attr); + pthread_attr_setstacksize(&attr, THREAD_STACK); + + if (opt_bidi && bidi_pass) { + pthread_init_mutex(); + if (!switching_notify) { + ksft_print_msg("Switching Tx/Rx vectors\n"); + switching_notify++; + } + } + + pthread_mutex_lock(&sync_mutex); + + /*Spawn RX thread */ + if (!opt_bidi || (opt_bidi && !bidi_pass)) { + if (pthread_create(&t0, &attr, worker_testapp_validate, (void *)ifdict[1])) + exit_with_error(errno); + } else if (opt_bidi && bidi_pass) { + /*switch Tx/Rx vectors */ + ifdict[0]->fv.vector = rx; + if (pthread_create(&t0, &attr, worker_testapp_validate, (void *)ifdict[0])) + exit_with_error(errno); + } + + struct timespec max_wait = { 0, 0 }; + + if (clock_gettime(CLOCK_REALTIME, &max_wait)) + exit_with_error(errno); + max_wait.tv_sec += TMOUT_SEC; + + if (pthread_cond_timedwait(&signal_rx_condition, &sync_mutex, &max_wait) == ETIMEDOUT) + exit_with_error(errno); + + pthread_mutex_unlock(&sync_mutex); + + /*Spawn TX thread */ + if (!opt_bidi || (opt_bidi && !bidi_pass)) { + if (pthread_create(&t1, &attr, worker_testapp_validate, (void *)ifdict[0])) + exit_with_error(errno); + } else if (opt_bidi && bidi_pass) { + /*switch Tx/Rx vectors */ + ifdict[1]->fv.vector = tx; + if (pthread_create(&t1, &attr, worker_testapp_validate, (void *)ifdict[1])) + exit_with_error(errno); + } + + pthread_join(t1, NULL); + pthread_join(t0, NULL); + + if (debug_pkt_dump) { + worker_pkt_dump(); + for (int iter = 0; iter < num_frames - 1; iter++) { + free(pkt_buf[iter]->payload); + free(pkt_buf[iter]); + } + free(pkt_buf); + } + + if (!opt_teardown && !opt_bidi) + print_ksft_result(); +} + +static void testapp_sockets(void) +{ + for (int i = 0; i < (opt_teardown ? 
+
+static void testapp_sockets(void)
+{
+	for (int i = 0; i < (opt_teardown ? MAX_TEARDOWN_ITER : MAX_BIDI_ITER); i++) {
+		pkt_counter = 0;
+		prev_pkt = -1;
+		sigvar = 0;
+		ksft_print_msg("Creating socket\n");
+		testapp_validate();
+		if (opt_bidi)
+			bidi_pass++;
+	}
+
+	print_ksft_result();
+}
+
+static void init_iface_config(void *ifaceconfig)
+{
+	/* Init interface0 */
+	ifdict[0]->fv.vector = tx;
+	memcpy(ifdict[0]->dst_mac, ((struct ifaceconfigobj *)ifaceconfig)->dst_mac, ETH_ALEN);
+	memcpy(ifdict[0]->src_mac, ((struct ifaceconfigobj *)ifaceconfig)->src_mac, ETH_ALEN);
+	ifdict[0]->dst_ip = ((struct ifaceconfigobj *)ifaceconfig)->dst_ip.s_addr;
+	ifdict[0]->src_ip = ((struct ifaceconfigobj *)ifaceconfig)->src_ip.s_addr;
+	ifdict[0]->dst_port = ((struct ifaceconfigobj *)ifaceconfig)->dst_port;
+	ifdict[0]->src_port = ((struct ifaceconfigobj *)ifaceconfig)->src_port;
+
+	/* Init interface1 */
+	ifdict[1]->fv.vector = rx;
+	memcpy(ifdict[1]->dst_mac, ((struct ifaceconfigobj *)ifaceconfig)->src_mac, ETH_ALEN);
+	memcpy(ifdict[1]->src_mac, ((struct ifaceconfigobj *)ifaceconfig)->dst_mac, ETH_ALEN);
+	ifdict[1]->dst_ip = ((struct ifaceconfigobj *)ifaceconfig)->src_ip.s_addr;
+	ifdict[1]->src_ip = ((struct ifaceconfigobj *)ifaceconfig)->dst_ip.s_addr;
+	ifdict[1]->dst_port = ((struct ifaceconfigobj *)ifaceconfig)->src_port;
+	ifdict[1]->src_port = ((struct ifaceconfigobj *)ifaceconfig)->dst_port;
+}
+
+int main(int argc, char **argv)
+{
+	struct rlimit _rlim = { RLIM_INFINITY, RLIM_INFINITY };
+
+	if (setrlimit(RLIMIT_MEMLOCK, &_rlim))
+		exit_with_error(errno);
+
+	const char *MAC1 = "\x00\x0A\x56\x9E\xEE\x62";
+	const char *MAC2 = "\x00\x0A\x56\x9E\xEE\x61";
+	const char *IP1 = "192.168.100.162";
+	const char *IP2 = "192.168.100.161";
+	u16 UDP_DST_PORT = 2020;
+	u16 UDP_SRC_PORT = 2121;
+
+	ifaceconfig = (struct ifaceconfigobj *)malloc(sizeof(struct ifaceconfigobj));
+	if (!ifaceconfig)
+		exit_with_error(errno);
+
+	memcpy(ifaceconfig->dst_mac, MAC1, ETH_ALEN);
+	memcpy(ifaceconfig->src_mac, MAC2, ETH_ALEN);
+	inet_aton(IP1, &ifaceconfig->dst_ip);
+	inet_aton(IP2, &ifaceconfig->src_ip);
+	ifaceconfig->dst_port = UDP_DST_PORT;
+	ifaceconfig->src_port = UDP_SRC_PORT;
+
+	for (int i = 0; i < MAX_INTERFACES; i++) {
+		ifdict[i] = (struct ifobject *)malloc(sizeof(struct ifobject));
+		if (!ifdict[i])
+			exit_with_error(errno);
+
+		ifdict[i]->ifdict_index = i;
+	}
+
+	setlocale(LC_ALL, "");
+
+	parse_command_line(argc, argv);
+
+	/* One extra frame is needed for the end-of-transmission marker */
+	num_frames = ++opt_pkt_count;
+
+	init_iface_config((void *)ifaceconfig);
+
+	pthread_init_mutex();
+
+	ksft_set_plan(1);
+
+	if (!opt_teardown && !opt_bidi) {
+		testapp_validate();
+	} else if (opt_teardown && opt_bidi) {
+		ksft_test_result_fail("ERROR: parameters -T and -B cannot be used together\n");
+		ksft_exit_xfail();
+	} else {
+		testapp_sockets();
+	}
+
+	for (int i = 0; i < MAX_INTERFACES; i++)
+		free(ifdict[i]);
+
+	pthread_destroy_mutex();
+
+	ksft_exit_pass();
+
+	return 0;
+}
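The binary is driven by the xsk_prereqs.sh script added further below rather
than run by hand. Judging from that script's execxdpxceiver() helper, its
invocation reduces to the following shape (a sketch: the veth and namespace
names are placeholders created by the surrounding test scripts, and -C
appears to set the packet count consumed as opt_pkt_count above):

    ./xdpxceiver -i ve0 -i ve1,af_xdp_ns -C 10000

where each -i names an interface, optionally suffixed with the network
namespace that holds it.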
diff --git a/tools/testing/selftests/bpf/xdpxceiver.h b/tools/testing/selftests/bpf/xdpxceiver.h
new file mode 100644
index 000000000000..61f595b6f200
--- /dev/null
+++ b/tools/testing/selftests/bpf/xdpxceiver.h
@@ -0,0 +1,160 @@
+/* SPDX-License-Identifier: GPL-2.0
+ * Copyright(c) 2020 Intel Corporation.
+ */
+
+#ifndef XDPXCEIVER_H_
+#define XDPXCEIVER_H_
+
+#ifndef SOL_XDP
+#define SOL_XDP 283
+#endif
+
+#ifndef AF_XDP
+#define AF_XDP 44
+#endif
+
+#ifndef PF_XDP
+#define PF_XDP AF_XDP
+#endif
+
+#define MAX_INTERFACES 2
+#define MAX_INTERFACE_NAME_CHARS 7
+#define MAX_INTERFACES_NAMESPACE_CHARS 10
+#define MAX_SOCKS 1
+#define MAX_TEARDOWN_ITER 10
+#define MAX_BIDI_ITER 2
+#define PKT_HDR_SIZE (sizeof(struct ethhdr) + sizeof(struct iphdr) + \
+			sizeof(struct udphdr))
+#define MIN_PKT_SIZE 64
+#define ETH_FCS_SIZE 4
+#define PKT_SIZE (MIN_PKT_SIZE - ETH_FCS_SIZE)
+#define IP_PKT_SIZE (PKT_SIZE - sizeof(struct ethhdr))
+#define IP_PKT_VER 0x4
+#define IP_PKT_TOS 0x9
+#define UDP_PKT_SIZE (IP_PKT_SIZE - sizeof(struct iphdr))
+#define UDP_PKT_DATA_SIZE (UDP_PKT_SIZE - sizeof(struct udphdr))
+#define TMOUT_SEC (3)
+#define EOT (-1)
+#define USLEEP_MAX 200000
+#define THREAD_STACK 60000000
+#define SOCK_RECONF_CTR 10
+#define BATCH_SIZE 64
+#define POLL_TMOUT 1000
+#define NEED_WAKEUP true
+
+typedef __u32 u32;
+typedef __u16 u16;
+typedef __u8 u8;
+
+enum TESTS {
+	ORDER_CONTENT_VALIDATE_XDP_SKB = 0,
+	ORDER_CONTENT_VALIDATE_XDP_DRV = 1,
+};
+
+u8 uut;
+u8 debug_pkt_dump;
+u32 num_frames;
+u8 switching_notify;
+u8 bidi_pass;
+
+static u32 opt_xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST;
+static int opt_queue;
+static int opt_pkt_count;
+static int opt_poll;
+static int opt_teardown;
+static int opt_bidi;
+static u32 opt_xdp_bind_flags = XDP_USE_NEED_WAKEUP;
+static u8 pkt_data[XSK_UMEM__DEFAULT_FRAME_SIZE];
+static u32 pkt_counter;
+static u32 prev_pkt = -1;
+static int sigvar;
+
+struct xsk_umem_info {
+	struct xsk_ring_prod fq;
+	struct xsk_ring_cons cq;
+	struct xsk_umem *umem;
+	void *buffer;
+};
+
+struct xsk_socket_info {
+	struct xsk_ring_cons rx;
+	struct xsk_ring_prod tx;
+	struct xsk_umem_info *umem;
+	struct xsk_socket *xsk;
+	unsigned long rx_npkts;
+	unsigned long tx_npkts;
+	unsigned long prev_rx_npkts;
+	unsigned long prev_tx_npkts;
+	u32 outstanding_tx;
+};
+
+struct flow_vector {
+	enum fvector {
+		tx,
+		rx,
+		bidi,
+		undef,
+	} vector;
+};
+
+struct generic_data {
+	u32 seqnum;
+};
+
+struct ifaceconfigobj {
+	u8 dst_mac[ETH_ALEN];
+	u8 src_mac[ETH_ALEN];
+	struct in_addr dst_ip;
+	struct in_addr src_ip;
+	u16 src_port;
+	u16 dst_port;
+} *ifaceconfig;
+
+struct ifobject {
+	int ifindex;
+	int ifdict_index;
+	char ifname[MAX_INTERFACE_NAME_CHARS];
+	char nsname[MAX_INTERFACES_NAMESPACE_CHARS];
+	struct flow_vector fv;
+	struct xsk_socket_info *xsk;
+	struct xsk_umem_info *umem;
+	u8 dst_mac[ETH_ALEN];
+	u8 src_mac[ETH_ALEN];
+	u32 dst_ip;
+	u32 src_ip;
+	u16 src_port;
+	u16 dst_port;
+};
+
+static struct ifobject *ifdict[MAX_INTERFACES];
+
+/* threads */
+atomic_int spinning_tx;
+atomic_int spinning_rx;
+pthread_mutex_t sync_mutex;
+pthread_mutex_t sync_mutex_tx;
+pthread_cond_t signal_rx_condition;
+pthread_cond_t signal_tx_condition;
+pthread_t t0, t1, ns_thread;
+pthread_attr_t attr;
+
+struct targs {
+	bool retptr;
+	int idx;
+};
+
+TAILQ_HEAD(head_s, pkt) head = TAILQ_HEAD_INITIALIZER(head);
+struct head_s *head_p;
+struct pkt {
+	char *pkt_frame;
+
+	TAILQ_ENTRY(pkt) pkt_nodes;
+} *pkt_node_rx, *pkt_node_rx_q;
+
+struct pkt_frame {
+	char *payload;
+} *pkt_obj;
+
+struct pkt_frame **pkt_buf;
+
+#endif /* XDPXCEIVER_H_ */
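The size macros above encode the wire layout of each test frame: a minimal
64-byte Ethernet frame minus the 4-byte FCS leaves PKT_SIZE = 60 bytes, of
which PKT_HDR_SIZE = 42 bytes (14 Ethernet + 20 IPv4 + 8 UDP) are headers and
UDP_PKT_DATA_SIZE = 18 bytes are payload, the first four carrying the
sequence number checked by worker_pkt_validate(). A standalone compile-time
sketch double-checking that arithmetic:

    #include <linux/if_ether.h>	/* struct ethhdr: 14 bytes */
    #include <linux/ip.h>	/* struct iphdr: 20 bytes without options */
    #include <linux/udp.h>	/* struct udphdr: 8 bytes */

    #define MIN_PKT_SIZE 64
    #define ETH_FCS_SIZE 4
    #define PKT_SIZE (MIN_PKT_SIZE - ETH_FCS_SIZE)
    #define PKT_HDR_SIZE (sizeof(struct ethhdr) + sizeof(struct iphdr) + \
			sizeof(struct udphdr))
    #define UDP_PKT_DATA_SIZE (PKT_SIZE - PKT_HDR_SIZE)

    _Static_assert(PKT_SIZE == 60, "60 bytes on the wire after dropping FCS");
    _Static_assert(PKT_HDR_SIZE == 42, "Ethernet + IPv4 + UDP headers");
    _Static_assert(UDP_PKT_DATA_SIZE == 18, "room for the u32 sequence number");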
diff --git a/tools/testing/selftests/bpf/xsk_prereqs.sh b/tools/testing/selftests/bpf/xsk_prereqs.sh
new file mode 100755
index 000000000000..9d54c4645127
--- /dev/null
+++ b/tools/testing/selftests/bpf/xsk_prereqs.sh
@@ -0,0 +1,135 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright(c) 2020 Intel Corporation.
+
+ksft_pass=0
+ksft_fail=1
+ksft_xfail=2
+ksft_xpass=3
+ksft_skip=4
+
+GREEN='\033[0;92m'
+YELLOW='\033[0;93m'
+RED='\033[0;31m'
+NC='\033[0m'
+STACK_LIM=131072
+SPECFILE=veth.spec
+XSKOBJ=xdpxceiver
+NUMPKTS=10000
+
+validate_root_exec()
+{
+	msg="skip all tests:"
+	if [ $UID != 0 ]; then
+		echo $msg must be run as root >&2
+		test_exit $ksft_fail 2
+	else
+		return $ksft_pass
+	fi
+}
+
+validate_veth_support()
+{
+	msg="skip all tests:"
+	if [ $(ip link add $1 type veth 2>/dev/null; echo $?;) != 0 ]; then
+		echo $msg veth kernel support not available >&2
+		test_exit $ksft_skip 1
+	else
+		ip link del $1
+		return $ksft_pass
+	fi
+}
+
+validate_veth_spec_file()
+{
+	if [ ! -f ${SPECFILE} ]; then
+		test_exit $ksft_skip 1
+	fi
+}
+
+test_status()
+{
+	statusval=$1
+	if [ -n "${colorconsole+set}" ]; then
+		if [ $statusval -eq 2 ]; then
+			echo -e "${YELLOW}$2${NC}: [ ${RED}FAIL${NC} ]"
+		elif [ $statusval -eq 1 ]; then
+			echo -e "${YELLOW}$2${NC}: [ ${RED}SKIPPED${NC} ]"
+		elif [ $statusval -eq 0 ]; then
+			echo -e "${YELLOW}$2${NC}: [ ${GREEN}PASS${NC} ]"
+		fi
+	else
+		if [ $statusval -eq 2 ]; then
+			echo -e "$2: [ FAIL ]"
+		elif [ $statusval -eq 1 ]; then
+			echo -e "$2: [ SKIPPED ]"
+		elif [ $statusval -eq 0 ]; then
+			echo -e "$2: [ PASS ]"
+		fi
+	fi
+}
+
+test_exit()
+{
+	retval=$1
+	if [ $2 -ne 0 ]; then
+		test_status $2 $(basename $0)
+	fi
+	exit $retval
+}
+
+clear_configs()
+{
+	if [ $(ip netns show | grep $3 &>/dev/null; echo $?;) == 0 ]; then
+		[ $(ip netns exec $3 ip link show $2 &>/dev/null; echo $?;) == 0 ] &&
+			{ echo "removing link $1:$2"; ip netns exec $3 ip link del $2; }
+		echo "removing ns $3"
+		ip netns del $3
+	fi
+	# Deleting one end of a veth pair removes the whole pair. This is
+	# just a precaution: if the namespace no longer exists, the peer
+	# inside it was not removed above, so delete it here explicitly.
+	[ $(ip link show $1 &>/dev/null; echo $?;) == 0 ] &&
+		{ echo "removing link $1"; ip link del $1; }
+	if [ -f ${SPECFILE} ]; then
+		echo "removing spec file:" ${SPECFILE}
+		rm -f ${SPECFILE}
+	fi
+}
+
+cleanup_exit()
+{
+	echo "cleaning up..."
+	clear_configs $1 $2 $3
+}
+
+validate_ip_utility()
+{
+	[ ! $(type -P ip) ] && { echo "'ip' not found. Skipping tests."; test_exit $ksft_skip 1; }
+}
+
+vethXDPgeneric()
+{
+	ip link set dev $1 xdpdrv off
+	ip netns exec $3 ip link set dev $2 xdpdrv off
+}
+
+vethXDPnative()
+{
+	ip link set dev $1 xdpgeneric off
+	ip netns exec $3 ip link set dev $2 xdpgeneric off
+}
+
+execxdpxceiver()
+{
+	local -a 'paramkeys=("${!'"$1"'[@]}")' copy
+	paramkeysstr=${paramkeys[*]}
+
+	for index in $paramkeysstr;
+	do
+		current=$1"[$index]"
+		copy[$index]=${!current}
+	done
+
+	./${XSKOBJ} -i ${VETH0} -i ${VETH1},${NS1} ${copy[*]} -C ${NUMPKTS}
+}
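Taken together, these helpers assume a caller that defines VETH0, VETH1 and
NS1 and then strings the checks together before invoking the binary. A
hypothetical driver (the variable values and the empty parameter array are
placeholders; the real runner builds them from its command line):

    #!/bin/bash
    source xsk_prereqs.sh
    VETH0=ve0; VETH1=ve1; NS1=af_xdp_ns   # placeholder names
    validate_root_exec
    validate_ip_utility
    validate_veth_support ${VETH0}
    declare -a params=()                  # extra xdpxceiver flags, if any
    execxdpxceiver params                 # pass the array by name
    test_exit $ksft_pass 0

Note that execxdpxceiver() takes the *name* of a bash array and expands it
indirectly, which is why the sketch passes the bare word params.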
diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh
index 0eae628d1ffd..9aa9624cff97 100755
--- a/tools/testing/selftests/net/mptcp/mptcp_join.sh
+++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh
@@ -264,27 +264,37 @@ do_transfer()
 	cpid=$!
 
 	if [ $rm_nr_ns1 -gt 0 ]; then
-		counter=1
-		sleep 1
+		if [ $rm_nr_ns1 -lt 8 ]; then
+			counter=1
+			sleep 1
 
-		while [ $counter -le $rm_nr_ns1 ]
-		do
-			ip netns exec ${listener_ns} ./pm_nl_ctl del $counter
+			while [ $counter -le $rm_nr_ns1 ]
+			do
+				ip netns exec ${listener_ns} ./pm_nl_ctl del $counter
+				sleep 1
+				let counter+=1
+			done
+		else
 			sleep 1
-			let counter+=1
-		done
+			ip netns exec ${listener_ns} ./pm_nl_ctl flush
+		fi
 	fi
 
 	if [ $rm_nr_ns2 -gt 0 ]; then
-		counter=1
-		sleep 1
+		if [ $rm_nr_ns2 -lt 8 ]; then
+			counter=1
+			sleep 1
 
-		while [ $counter -le $rm_nr_ns2 ]
-		do
-			ip netns exec ${connector_ns} ./pm_nl_ctl del $counter
+			while [ $counter -le $rm_nr_ns2 ]
+			do
+				ip netns exec ${connector_ns} ./pm_nl_ctl del $counter
+				sleep 1
+				let counter+=1
+			done
+		else
 			sleep 1
-			let counter+=1
-		done
+			ip netns exec ${connector_ns} ./pm_nl_ctl flush
+		fi
 	fi
 
 	wait $cpid
@@ -663,6 +673,18 @@ chk_join_nr "remove subflows and signal" 3 3 3
 chk_add_nr 1 1
 chk_rm_nr 2 2
 
+# subflows and signal, flush
+reset
+ip netns exec $ns1 ./pm_nl_ctl limits 0 3
+ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal
+ip netns exec $ns2 ./pm_nl_ctl limits 1 3
+ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
+ip netns exec $ns2 ./pm_nl_ctl add 10.0.4.2 flags subflow
+run_tests $ns1 $ns2 10.0.1.1 0 8 8 slow
+chk_join_nr "flush subflows and signal" 3 3 3
+chk_add_nr 1 1
+chk_rm_nr 2 2
+
 # subflow IPv6
 reset
 ip netns exec $ns1 ./pm_nl_ctl limits 0 1
diff --git a/tools/testing/selftests/net/test_vxlan_under_vrf.sh b/tools/testing/selftests/net/test_vxlan_under_vrf.sh
index 09f9ed92cbe4..534c8b7699ab 100755
--- a/tools/testing/selftests/net/test_vxlan_under_vrf.sh
+++ b/tools/testing/selftests/net/test_vxlan_under_vrf.sh
@@ -50,7 +50,7 @@ cleanup() {
     ip link del veth-tap 2>/dev/null || true
 
     for ns in hv-1 hv-2 vm-1 vm-2; do
-        ip netns del $ns || true
+        ip netns del $ns 2>/dev/null || true
     done
 }
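The mptcp_join.sh hunks above switch to a single pm_nl_ctl flush when eight
or more addresses would otherwise be deleted one id at a time, and add a
matching "flush subflows and signal" test case. The test_vxlan_under_vrf.sh
hunk applies a common defensive-cleanup idiom: deleting a namespace that may
never have been created should neither print noise nor abort the script.
That idiom in isolation (a standalone sketch; the namespace name is a
placeholder):

    #!/bin/bash
    set -e
    cleanup() {
        # The namespace may not exist yet when the trap fires: silence the
        # complaint and force success so the EXIT trap survives 'set -e'.
        ip netns del demo-ns 2>/dev/null || true
    }
    trap cleanup EXIT
    ip netns add demo-ns
    # ... test body ...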